From 900cd3f24723ecfc1aa546f34b017725e809f2d6 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Wed, 14 Jun 2023 18:26:09 +0300 Subject: [PATCH 01/43] Switch CSV file for perftests and add Redis step (#1199) --- .github/workflows/performance-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 596d0a348..03541f770 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -60,6 +60,7 @@ jobs: run: | cd ../driver sudo make web sql="${{ secrets.DB_CONN_STRING }}" + sudo make redis - name: Check out delphi-admin uses: actions/checkout@v3 with: @@ -71,7 +72,7 @@ jobs: run: | cd delphi-admin/load-testing/locust docker build -t locust . - export CSV=v4-requests-as_of.csv + export CSV=v4-requests-small.csv touch output_stats.csv && chmod 666 output_stats.csv touch output_stats_history.csv && chmod 666 output_stats_history.csv touch output_failures.csv && chmod 666 output_failures.csv From f5051d33bcf02cd3afbd132e67f3d20ed7024c72 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 11:33:25 -0700 Subject: [PATCH 02/43] feat(acquisition): remove and deactivate norostat --- deploy.json | 9 - .../norostat/norostat_add_history.py | 45 -- src/acquisition/norostat/norostat_raw.py | 112 ----- src/acquisition/norostat/norostat_sql.py | 434 ------------------ src/acquisition/norostat/norostat_update.py | 28 -- src/acquisition/norostat/norostat_utils.py | 44 -- .../norostat/sample_content.pickle | Bin 37801 -> 0 bytes 7 files changed, 672 deletions(-) delete mode 100644 src/acquisition/norostat/norostat_add_history.py delete mode 100644 src/acquisition/norostat/norostat_raw.py delete mode 100644 src/acquisition/norostat/norostat_sql.py delete mode 100644 src/acquisition/norostat/norostat_update.py delete mode 100644 src/acquisition/norostat/norostat_utils.py delete mode 100644 src/acquisition/norostat/sample_content.pickle diff --git a/deploy.json b/deploy.json index 59d141ba4..3396dbbf6 100644 --- a/deploy.json +++ b/deploy.json @@ -138,15 +138,6 @@ "add-header-comment": true }, - "// acquisition - norostat", - { - "type": "move", - "src": "src/acquisition/norostat/", - "dst": "[[package]]/acquisition/norostat/", - "match": "^.*\\.(py)$", - "add-header-comment": true - }, - "// acquisition - paho", { "type": "move", diff --git a/src/acquisition/norostat/norostat_add_history.py b/src/acquisition/norostat/norostat_add_history.py deleted file mode 100644 index 64fd11ff7..000000000 --- a/src/acquisition/norostat/norostat_add_history.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Parses historical versions of the NoroSTAT data-table and updates the -appropriate databases. Currently uses snapshots from the WayBack Machine -(archive.org). A more comprehensive archival service may be mementoweb.org, -which appears to pull from many services that implement the Memento protocol, -including archive.org. Manually downloaded snapshots could be recorded via this -script as well. -""" - -# standard library -import re -import os -import time -import collections - -# first party -from . import norostat_sql -from . 
import norostat_raw - - - -def main(): - norostat_sql.ensure_tables_exist() - snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") - snapshot_version_counter = collections.Counter() - for subdir in os.listdir(snapshot_dir): - if re.match(r'[0-9]+', subdir) is not None: - # appears to be snapshot dir - snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory - for norostat_capitalization in ["norostat","noroSTAT"]: - time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision - path = os.path.join(snapshot_dir,subdir,"norovirus","reporting",norostat_capitalization,"data-table.html") - if os.path.isfile(path): - print("Processing file ", path) - with open(path, 'r') as datatable_file: - content = datatable_file.read() - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.record_long_raw(long_raw) - snapshot_version_counter[subdir] += 1 - print('Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); just "Counter()" indicates no snapshot directories were found:', snapshot_version_counter) - norostat_sql.update_point() - -if __name__ == '__main__': - main() diff --git a/src/acquisition/norostat/norostat_raw.py b/src/acquisition/norostat/norostat_raw.py deleted file mode 100644 index 582de9684..000000000 --- a/src/acquisition/norostat/norostat_raw.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Functions to fetch, save, load, and format the NoroSTAT data-table. Formatting -functions include conversion from html content to "wide_raw" --- a wide data -frame in a tuple along with metadata --- and then to "long_raw" --- a long/tall -data frame in a tuple along with metadata. Metadata: release_date, parse_time, -and (constant) location. Here, the location will be (a str representing) a set -of states. -""" - - - -# standard library -import datetime -import re -import pickle - -# third party -import requests -import lxml.html -import pandas as pd - -# first party -from .norostat_utils import * - -def fetch_content(norostat_datatable_url="https://www.cdc.gov/norovirus/reporting/norostat/data-table.html"): - """Download NoroSTAT data-table. Returns the html content.""" - headers = { - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - } - resp = requests.get(norostat_datatable_url, headers=headers) - expect_value_eq(resp.status_code, 200, - 'Wanted status code {}. 
Received: ') - expect_value_eq(resp.headers.get("Content-Type"), "text/html", - 'Expected Content-Type "{}"; Received ') - return resp.content - -def save_sample_content(content, f="sample_content.pickle"): - """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" - with open(f, "wb") as handle: - pickle.dump(content, handle) - -def load_sample_content(f="sample_content.pickle"): - """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" - with open(f, "rb") as handle: - content = pickle.load(handle) - return content - -def parse_content_to_wide_raw(content): - """Convert the html content for the data-table into a wide data frame, then stick it in a tuple along with the release_date, parse_time, and (constant) location.""" - parse_time = datetime.datetime.now() - html_root = lxml.html.fromstring(content) - # Extract the release date, a.k.a. dateModified, a.k.a. "Page last updated" date; ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span - dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') - dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') - if len(dateModified_meta_elts) == 1: - [dateModified_meta_elt] = dateModified_meta_elts - dateModified = dateModified_meta_elt.attrib['content'] - elif len(dateModified_span_elts) == 1: - [dateModified_span_elt] = dateModified_span_elts - dateModified = dateModified_span_elt.text - else: - raise Exception("Could not find the expected number of dateModified meta or span tags.") - # FIXME check/enforce locale - release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() - # Check that table description still specifies suspected&confirmed norovirus - # outbreaks (insensitive to case of certain letters and allowing for both old - # "to the" and new "through the" text), then extract list of states from the - # description: - [description_elt] = html_root.xpath('''//p[ - contains(translate(text(), "SCNORHD", "scnorhd"), "suspected and confirmed norovirus outbreaks reported by state health departments in") and - ( - contains(text(), "to the") or - contains(text(), "through the") - ) - ]''') - location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) - # Attempt to find exactly 1 table (note: it would be nice to filter on the - # associated caption, but no such caption is present in earlier versions): - [table] = html_root.xpath('//table') - # Convert html table to DataFrame: - # Directly reading in the table with pd.read_html performs unwanted dtype - # inference, but reveals the column names: - [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) - # We want all columns to be string columns. However, there does not appear - # to be an option to disable dtype inference in pd.read_html. 
Hide all - # entries inside 1-tuple wrappers using pre-dtype-inference converters, - # then unpack afterward (the entries fed to the converters should already - # be strings, but "convert" them to strings just in case): - [wide_raw_df_with_wrappers] = pd.read_html( - lxml.html.tostring(table), - converters= {col: lambda entry: (str(entry),) - for col in wide_raw_df_with_unwanted_conversions.columns} - ) - # Unwrap entries: - wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) - # Check format: - expect_value_eq(wide_raw_df.columns[0], "Week", - 'Expected raw_colnames[0] to be "{}"; encountered ') - for colname in wide_raw_df.columns: - expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", - 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head '%(colname)) - # Pack up df with metadata: - wide_raw = (wide_raw_df, release_date, parse_time, location) - return wide_raw - -def melt_wide_raw_to_long_raw(wide_raw): - (wide_raw_df, release_date, parse_time, location) = wide_raw - long_raw_df = wide_raw_df \ - .melt(id_vars=["Week"], var_name="measurement_type", value_name="value") \ - .rename(index=str, columns={"Week": "week"}) - long_raw = (long_raw_df, release_date, parse_time, location) - return long_raw diff --git a/src/acquisition/norostat/norostat_sql.py b/src/acquisition/norostat/norostat_sql.py deleted file mode 100644 index 168e275eb..000000000 --- a/src/acquisition/norostat/norostat_sql.py +++ /dev/null @@ -1,434 +0,0 @@ -# standard library -import re - -# third party -import mysql.connector - -# first party -from .norostat_utils import * -import delphi.operations.secrets as secrets - -# Column names: -# `release_date` :: release date as stated in the web page in the dateModified -# span, displayed on the web page with the label "Page last updated:" -# `parse_time` :: time that we attempted to parse the data out of a downloaded -# version of the web page; when the scraper is running, this may be similar -# to a fetch time, but when loading in past versions that have been saved, -# it probably won't mean the same thing; this is tracked (a) in case the -# provided release date ever is out of date so that the raw data will still -# be recorded and we can recover later on, and (b) to provide a record of -# when parses/fetches happened; if there is a request for the data for a -# particular `release_date` with no restrictions on `parse_time`, the -# version with the latest `parse_time` should be selected -# (`release_date`, `parse_time`) :: uniquely identify a version of the table -# `measurement_type_id` :: "pointer" to an interned measurement_type string -# `measurement_type` :: the name of some column other than "Week" in the -# data-table -# `location_id` :: "pointer" to an interned location string -# `location` :: a string containing the list of reporting states -# `week_id` :: "pointer" to an interned week string -# `week` :: a string entry from the "Week" column -# `value` :: an string entry from some column other than "Week" in the -# data-table -# `new_value` :: an update to a `value` provided by a new version of the data -# table: either a string representing an added or revised entry (or a -# redundant repetition of a value retained from a past issue --- although -# no such entries should be generated by the code in this file), or NULL -# representing a deletion of a cell/entry from the table -# -# Tables: -# `norostat_raw_datatable_version_list` :: list of all versions of the raw -# data-table that have ever been 
successfully parsed -# `_pool` :: maps each encountered value of string `` to a unique ID -# `_id`, so that the string's character data is not duplicated in the -# tables on disk; serves a purpose similar to Java's interned string pool -# `norostat_raw_datatable_diffs` :: contains diffs between consecutive versions -# of the raw data-table (when arranged according to the tuple -# (`release_date`,`parse_time`) using lexicographical tuple ordering) -# `norostat_raw_datatable_parsed` :: a temporary table to hold the version of -# the raw data-table (in long/melted format) to be recorded; uses string -# values instead of interned string id's, so will need to be joined with -# `*_pool` tables for operations with other tables -# `norostat_raw_datatable_previous` :: a temporary table to hold an -# already-recorded version of the raw data-table with the latest -# `release_date`, `parse_time` before those of the version to be recorded; -# if there is no such version, this table will be empty (as if we recorded -# an empty version of the table before all other versions); uses interned -# string id's -# `norostat_raw_datatable_next` :: a temporary table to hold an -# already-recorded version of the raw data-table with the earliest -# `release_date`, `parse_time` after those of the version to be recorded; -# if there is no such version, this table will not be created or used; uses -# interned string id's - -def ensure_tables_exist(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_measurement_type_pool` ( - `measurement_type_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `measurement_type` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_location_pool` ( - `location_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `location` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_week_pool` ( - `week_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `week` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - UNIQUE KEY (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `measurement_type_id`, `location_id`, `week_id`) - -- (the indices here are larger than the data, but reducing the key - -- sizes and adding an id somehow seems to result in larger index sizes - -- somehow) - ); - ''') - cnx.commit() - 
finally: - cnx.close() - -def dangerously_drop_all_norostat_tables(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - # Drop tables in reverse order (to avoid foreign key related errors): - cursor.execute(''' - DROP TABLE IF EXISTS `norostat_point_diffs`, - `norostat_point_version_list`, - `norostat_raw_datatable_diffs`, - `norostat_raw_datatable_week_pool`, - `norostat_raw_datatable_location_pool`, - `norostat_raw_datatable_measurement_type_pool`, - `norostat_raw_datatable_version_list`; - ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) - finally: - cnx.close() - -def record_long_raw(long_raw): - (long_raw_df, release_date, parse_time, location) = long_raw - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='SERIALIZABLE') - # Create, populate `norostat_raw_datatable_parsed`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_parsed` ( - `measurement_type` NVARCHAR(255) NOT NULL, - `location` NVARCHAR(255) NOT NULL, - `week` NVARCHAR(255) NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - PRIMARY KEY (`measurement_type`, `location`, `week`) - ) ENGINE=MEMORY; - ''') - cursor.executemany(''' - INSERT INTO `norostat_raw_datatable_parsed` (`week`,`measurement_type`,`value`,`location`) - VALUES (%s, %s, %s, %s); - ''', [(week, measurement_type, value, location) for - (week, measurement_type, value) in long_raw_df[["week","measurement_type","value"]].astype(str).itertuples(index=False, name=None) - ]) - # Create, populate `norostat_raw_datatable_previous`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_previous` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_previous` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s,%s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, 
`latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL; - ''', (release_date, parse_time, release_date, parse_time)) - # Find next recorded `release_date`, `parse_time` if any; create, populate - # `norostat_raw_datatable_next` if there is such a version: - cursor.execute(''' - SELECT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) > (%s,%s) - ORDER BY `release_date`, `parse_time` - LIMIT 1 - ''', (release_date, parse_time)) - next_version_if_any = cursor.fetchall() - expect_result_in(len, next_version_if_any, (0,1), - 'Bug: expected next-version query to return a number of results in {}; instead have len & val ') - if len(next_version_if_any) != 0: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_next` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_next` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s, %s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, `latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL -- NULL means value was removed - ''', next_version_if_any[0]+next_version_if_any[0]) - # Register new version in version list: - try: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_version_list` (`release_date`, `parse_time`) - VALUES (%s, %s) - ''', (release_date, parse_time)) - except mysql.connector.errors.IntegrityError as e: - raise Exception(['Encountered an IntegrityError when updating the norostat_raw_datatable_version_list table; this probably indicates that a version with the same `release_date` and `parse_time` was already added to the database; parse_time has limited resolution, so this can happen from populating the database too quickly when there are 
duplicate release dates; original error: ', e]) - # Add any new measurement_type, location, or week strings to the associated - # string pools: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_measurement_type_pool` (`measurement_type`) - SELECT DISTINCT `measurement_type` - FROM `norostat_raw_datatable_parsed` - WHERE `measurement_type` NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type` - FROM `norostat_raw_datatable_measurement_type_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_location_pool` (`location`) - SELECT DISTINCT `location` - FROM `norostat_raw_datatable_parsed` - WHERE `location` NOT IN ( - SELECT `norostat_raw_datatable_location_pool`.`location` - FROM `norostat_raw_datatable_location_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_week_pool` (`week`) - SELECT DISTINCT `week` - FROM `norostat_raw_datatable_parsed` - WHERE `week` NOT IN ( - SELECT `norostat_raw_datatable_week_pool`.`week` - FROM `norostat_raw_datatable_week_pool` - ); - ''') - # Record diff: [newly parsed version "minus" previous version] (first, - # record additions/updates, then record deletions): - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - SELECT `norostat_raw_datatable_previous`.`measurement_type_id`, - `norostat_raw_datatable_previous`.`location_id`, - `norostat_raw_datatable_previous`.`week_id`, - `norostat_raw_datatable_previous`.`value` - FROM `norostat_raw_datatable_previous` - ); - ''', (release_date, parse_time)) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_previous` - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - `norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - ); - ''', (release_date, parse_time)) - # If there is an already-recorded next version, its diff is invalidated by - # the insertion of the newly parsed version; delete the [next version - # "minus" previous version] diff and record the [next version "minus" newly - # parsed] diff: - if len(next_version_if_any) != 0: - cursor.execute(''' - DELETE FROM `norostat_raw_datatable_diffs` - WHERE `release_date`=%s AND `parse_time`=%s; - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM 
`norostat_raw_datatable_next` - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - SELECT - `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - `norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id`, - `norostat_raw_datatable_parsed`.`value` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - ); - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_next`.`measurement_type_id`, - `norostat_raw_datatable_next`.`location_id`, - `norostat_raw_datatable_next`.`week_id` - FROM `norostat_raw_datatable_next` - ); - ''', next_version_if_any[0]) - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` datetime(6) NOT NULL, - `location_id` INT NOT NULL, - `epiweek` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_point_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`location_id`) REFERENCES norostat_raw_datatable_location_pool (`location_id`), - UNIQUE KEY (`location_id`, `epiweek`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `location_id`, `epiweek`) - ); - ''') - cnx.commit() # (might do nothing; each statement above takes effect and/or commits immediately) - finally: - cnx.close() - -def update_point(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='serializable') - cursor.execute(''' - SELECT `release_date`, `parse_time`, `measurement_type`, `location_id`, `week`, `new_value` - FROM `norostat_raw_datatable_diffs` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type_id`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week_id`) - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - raw_datatable_diff_selection = cursor.fetchall() - prog = re.compile(r"[0-9]+-[0-9]+$") - point_diff_insertion = [ - (release_date, parse_time, location_id, - season_db_to_epiweek(measurement_type, week), - int(new_value_str) if new_value_str is not None else None - ) - for (release_date, parse_time, measurement_type, location_id, 
week, new_value_str) - in raw_datatable_diff_selection - if prog.match(measurement_type) is not None and - new_value_str != "" - ] - cursor.execute(''' - INSERT INTO `norostat_point_version_list` (`release_date`, `parse_time`) - SELECT DISTINCT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - cursor.executemany(''' - INSERT INTO `norostat_point_diffs` (`release_date`, `parse_time`, `location_id`, `epiweek`, `new_value`) - VALUES (%s, %s, %s, %s, %s) - ''', point_diff_insertion) - cnx.commit() - finally: - cnx.close() - -# note there are more efficient ways to calculate diffs without forming ..._next table -# todo give indices names -# todo trim pool functionality for if data is deleted? -# todo make classes to handle pool, keyval store, and diff table query formation -# todo test mode w/ rollback -# todo record position of rows and columns in raw data-table (using additional diff tables) -# todo consider measurement index mapping to another id -# todo add fetch_time to version list -# xxx replace "import *"'s -# xxx should cursor be closed? -# xxx is cnx auto-closed on errors? -# xxx drop temporary tables? -# fixme time zone issues diff --git a/src/acquisition/norostat/norostat_update.py b/src/acquisition/norostat/norostat_update.py deleted file mode 100644 index 4b0021dd5..000000000 --- a/src/acquisition/norostat/norostat_update.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -=============== -=== Purpose === -=============== - -Fetch NoroSTAT data table from -; -process and record it in the appropriate databases. -""" - -# first party -from . import norostat_sql -from . 
import norostat_raw
-
-
-def main():
-  # Download the data:
-  # content = norostat_raw.load_sample_content()
-  content = norostat_raw.fetch_content()
-  # norostat_raw.save_sample_content(content)
-  wide_raw = norostat_raw.parse_content_to_wide_raw(content)
-  long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw)
-  norostat_sql.ensure_tables_exist()
-  norostat_sql.record_long_raw(long_raw)
-  norostat_sql.update_point()
-
-if __name__ == '__main__':
-  main()
diff --git a/src/acquisition/norostat/norostat_utils.py b/src/acquisition/norostat/norostat_utils.py
deleted file mode 100644
index a99a4dc96..000000000
--- a/src/acquisition/norostat/norostat_utils.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# standard library
-import re
-import datetime
-
-# first party
-from delphi.utils.epidate import EpiDate
-
-# helper funs for checking expectations, throwing exceptions on violations:
-def expect_value_eq(encountered, expected, mismatch_format):
-  if encountered != expected:
-    raise Exception([mismatch_format.format(expected), encountered])
-def expect_result_eq(f, value, expected, mismatch_format):
-  result = f(value)
-  if result != expected:
-    raise Exception([mismatch_format.format(expected), result, value])
-def expect_value_in(encountered, expected_candidates, mismatch_format):
-  if encountered not in expected_candidates:
-    raise Exception([mismatch_format.format(expected_candidates), encountered])
-def expect_result_in(f, value, expected_candidates, mismatch_format):
-  result = f(value)
-  if result not in expected_candidates:
-    raise Exception([mismatch_format.format(expected_candidates), result, value])
-def expect_str_contains(encountered, regex, mismatch_format):
-  if re.search(regex, encountered) is None:
-    raise Exception([mismatch_format.format(regex), encountered])
-
-# helper fun used with expect_* funs to check value of .dtype.kind:
-def dtype_kind(numpy_like):
-  return numpy_like.dtype.kind
-
-# helper fun used to convert season string ("YYYY-YY" or "YYYY-YYYY") and
-# "Week" string (strptime format "%d-%b") to the corresponding epiweek; assumes
-# by default that dates >= 1-Aug correspond to weeks of the first year:
-def season_db_to_epiweek(season_str, db_date_str, first_db_date_of_season_str="1-Aug"):
-  year_strs = season_str.split("-")
-  first_year = int(year_strs[0])
-  second_year = first_year + 1
-  # FIXME check/enforce locale
-  first_date_of_season = datetime.datetime.strptime(first_db_date_of_season_str+"-"+str(first_year), "%d-%b-%Y").date()
-  date_using_first_year = datetime.datetime.strptime(db_date_str+"-"+str(first_year), "%d-%b-%Y").date()
-  date_using_second_year = datetime.datetime.strptime(db_date_str+"-"+str(second_year), "%d-%b-%Y").date()
-  date = date_using_first_year if date_using_first_year >= first_date_of_season else date_using_second_year
-  epiweek = EpiDate(date.year, date.month, date.day).get_ew()
-  return epiweek
diff --git a/src/acquisition/norostat/sample_content.pickle b/src/acquisition/norostat/sample_content.pickle
deleted file mode 100644
index 1518dde0deb517bb8641573d685ba808d9e39c8d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 37801
[base85 payload of the deleted 37801-byte sample_content.pickle omitted]

Date: Fri, 23 Jun 2023 11:33:56 -0700
Subject: [PATCH 03/43] docs(norostat): note that updates stopped November 2020

---
 docs/api/norostat.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/api/norostat.md b/docs/api/norostat.md
index 6e801116c..734dd9aa9 100644
--- a/docs/api/norostat.md
+++ b/docs/api/norostat.md
@@ -13,6 +13,8 @@ General topics not specific to any particular endpoint are discussed in the
 [contributing](README.md#contributing), [citing](README.md#citing), and
 [data licensing](README.md#data-licensing).
 
+**NOTE**: This data source stopped acquiring data in November 2020.
+
 ## NoroSTAT Data
 
 ...

From 32b05e976d66f50c6b0d7c5e04bf4f2923e503a5 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Fri, 23 Jun 2023 11:38:18 -0700
Subject: [PATCH 04/43] docs(norostat): correct wording

---
 docs/api/norostat.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/norostat.md b/docs/api/norostat.md
index 734dd9aa9..dded4ec13 100644
--- a/docs/api/norostat.md
+++ b/docs/api/norostat.md
@@ -13,7 +13,7 @@ General topics not specific to any particular endpoint are discussed in the
 [contributing](README.md#contributing), [citing](README.md#citing), and
 [data licensing](README.md#data-licensing).
 
-**NOTE**: This data source stopped acquiring data in November 2020.
+**NOTE**: Delphi stopped acquiring data from this data source in November 2020.
 
 ## NoroSTAT Data
 

From c7e45c1a23873cddfeaf396742c8c7ac1240d8d2 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Fri, 23 Jun 2023 14:55:03 -0700
Subject: [PATCH 05/43] feat(afhsb): remove afhsb acquisition code

---
 src/acquisition/afhsb/afhsb_csv.py    | 351 --------------------------
 src/acquisition/afhsb/afhsb_sql.py    | 194 --------------
 src/acquisition/afhsb/afhsb_update.py |  39 ---
 3 files changed, 584 deletions(-)
 delete mode 100644 src/acquisition/afhsb/afhsb_csv.py
 delete mode 100644 src/acquisition/afhsb/afhsb_sql.py
 delete mode 100644 src/acquisition/afhsb/afhsb_update.py

diff --git a/src/acquisition/afhsb/afhsb_csv.py b/src/acquisition/afhsb/afhsb_csv.py
deleted file mode 100644
index b839c4053..000000000
--- a/src/acquisition/afhsb/afhsb_csv.py
+++ /dev/null
@@ -1,351 +0,0 @@
-'''
-afhsb_csv.py creates CSV files filled_00to13.csv, filled_13to17.csv and simple_DMISID_FY2018.csv
-which will be later used to create MYSQL data tables.
- -Several intermediate files will be created, including: -00to13.pickle 13to17.pickle 00to13.csv 13to17.csv - -Required source files: -ili_1_2000_5_2013_new.sas7bdat and ili_1_2013_11_2017_new.sas7bdat under SOURCE_DIR -country_codes.csv and DMISID_FY2018.csv under TARGET_DIR -All intermediate files and final csv files will be stored in TARGET_DIR -''' - -import csv -import os - -import pickle -import sas7bdat -import epiweeks as epi - - -DATAPATH = '/home/automation/afhsb_data' -SOURCE_DIR = DATAPATH -TARGET_DIR = DATAPATH - -INVALID_DMISIDS = set() - -def get_flu_cat(dx): - # flu1 (influenza) - if len(dx) == 0: - return None - dx = dx.capitalize() - if dx.isnumeric(): - for prefix in ["487", "488"]: - if dx.startswith(prefix): - return 1 - for i in range(0, 7): - prefix = str(480 + i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = str(460 + i) - if dx.startswith(prefix): - return 3 - for prefix in ["07999", "3829", "7806", "7862"]: - if dx.startswith(prefix): - return 3 - elif (dx[0].isalpha() and dx[1:].isnumeric()): - for prefix in ["J09", "J10", "J11"]: - if dx.startswith(prefix): - return 1 - for i in range(12, 19): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = "J0{}".format(i) - if dx.startswith(prefix): - return 3 - for i in range(20, 23): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 3 - for prefix in ["J40", "R05", "H669", "R509", "B9789"]: - if dx.startswith(prefix): - return 3 - else: - return None - -def aggregate_data(sourcefile, targetfile): - reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) - # map column names to column indices - col_2_idx = {column.name.decode('utf-8'): column.col_id for column in reader.columns} - - def get_field(row, column): - return row[col_2_idx[column]] - - def row2flu(row): - for i in range(1, 9): - dx = get_field(row, "dx{}".format(i)) - flu_cat = get_flu_cat(dx) - if flu_cat is not None: - return flu_cat - return 0 - - def row2epiweek(row): - date = get_field(row, 'd_event') - year, month, day = date.year, date.month, date.day - week_tuple = epi.Week.fromdate(year, month, day).weektuple() - year, week_num = week_tuple[0], week_tuple[1] - return year, week_num - - results_dict = {} - for _, row in enumerate(reader): - # if (r >= 1000000): break - if get_field(row, 'type') != "Outpt": - continue - year, week_num = row2epiweek(row) - dmisid = get_field(row, 'DMISID') - flu_cat = row2flu(row) - - key_list = [year, week_num, dmisid, flu_cat] - curr_dict = results_dict - for i, key in enumerate(key_list): - if i == len(key_list) - 1: - if key not in curr_dict: - curr_dict[key] = 0 - curr_dict[key] += 1 - else: - if key not in curr_dict: - curr_dict[key] = {} - curr_dict = curr_dict[key] - - results_path = os.path.join(TARGET_DIR, targetfile) - with open(results_path, 'wb') as f: - pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) - - -################# Functions for geographical information #################### - -def get_country_mapping(): - filename = "country_codes.csv" - mapping = dict() - with open(os.path.join(TARGET_DIR, filename), "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - print(row.keys()) - alpha2 = row['alpha-2'] - alpha3 = row['alpha-3'] - mapping[alpha2] = alpha3 - - return mapping - -def format_dmisid_csv(filename, target_name): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(filename)) - dst_path = os.path.join(TARGET_DIR, target_name) - - src_csv = open(src_path, 
"r", encoding='utf-8-sig') - reader = csv.DictReader(src_csv) - - dst_csv = open(dst_path, "w") - fieldnames = ['dmisid', 'country', 'state', 'zip5'] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - country_mapping = get_country_mapping() - - for row in reader: - country2 = row['Facility ISO Country Code'] - if country2 == "": - country3 = "" - elif country2 not in country_mapping: - for key in row.keys(): - print(key, row[key]) - continue - else: - country3 = country_mapping[country2] - new_row = {'dmisid': row['DMIS ID'], - 'country': country3, - 'state': row['Facility State Code'], - 'zip5': row['Facility 5-Digit ZIP Code']} - writer.writerow(new_row) - -def dmisid(): - filename = 'DMISID_FY2018' - target_name = "simple_DMISID_FY2018.csv" - format_dmisid_csv(filename, target_name) - - -cen2states = {'cen1': {'CT', 'ME', 'MA', 'NH', 'RI', 'VT'}, - 'cen2': {'NJ', 'NY', 'PA'}, - 'cen3': {'IL', 'IN', 'MI', 'OH', 'WI'}, - 'cen4': {'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'}, - 'cen5': {'DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV'}, - 'cen6': {'AL', 'KY', 'MS', 'TN'}, - 'cen7': {'AR', 'LA', 'OK', 'TX'}, - 'cen8': {'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY'}, - 'cen9': {'AK', 'CA', 'HI', 'OR', 'WA'}} - -hhs2states = {'hhs1': {'VT', 'CT', 'ME', 'MA', 'NH', 'RI'}, - 'hhs2': {'NJ', 'NY'}, - 'hhs3': {'DE', 'DC', 'MD', 'PA', 'VA', 'WV'}, - 'hhs4': {'AL', 'FL', 'GA', 'KY', 'MS', 'NC', 'TN', 'SC'}, - 'hhs5': {'IL', 'IN', 'MI', 'MN', 'OH', 'WI'}, - 'hhs6': {'AR', 'LA', 'NM', 'OK', 'TX'}, - 'hhs7': {'IA', 'KS', 'MO', 'NE'}, - 'hhs8': {'CO', 'MT', 'ND', 'SD', 'UT', 'WY'}, - 'hhs9': {'AZ', 'CA', 'HI', 'NV'}, - 'hhs10': {'AK', 'ID', 'OR', 'WA'}} - -def state2region(D): - results = dict() - for region in D.keys(): - states = D[region] - for state in states: - assert state not in results - results[state] = region - return results - -def state2region_csv(): - to_hhs = state2region(hhs2states) - to_cen = state2region(cen2states) - states = to_hhs.keys() - target_name = "state2region.csv" - fieldnames = ['state', 'hhs', 'cen'] - with open(target_name, "w") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for state in states: - content = {"state": state, "hhs": to_hhs[state], "cen": to_cen[state]} - writer.writerow(content) - -################# Functions for geographical information #################### - -######################### Functions for AFHSB data ########################## - -def write_afhsb_csv(period): - flu_mapping = {0: "ili-flu3", 1: "flu1", 2:"flu2-flu1", 3: "flu3-flu2"} - results_dict = pickle.load(open(os.path.join(TARGET_DIR, "{}.pickle".format(period)), 'rb')) - - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), 'w') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - - i = 0 - for year in sorted(results_dict.keys()): - year_dict = results_dict[year] - for week in sorted(year_dict.keys()): - week_dict = year_dict[week] - for dmisid in sorted(week_dict.keys()): - dmisid_dict = week_dict[dmisid] - for flu in sorted(dmisid_dict.keys()): - visit_sum = dmisid_dict[flu] - i += 1 - epiweek = int("{}{:02d}".format(year, week)) - flu_type = flu_mapping[flu] - - row = {"epiweek": epiweek, "dmisid": None if (not dmisid.isnumeric()) else dmisid, - "flu_type": flu_type, "visit_sum": visit_sum, "id": i} - writer.writerow(row) - if i % 100000 == 0: - print(row) - -def 
dmisid_start_time_from_file(filename): - starttime_record = dict() - with open(filename, 'r') as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - dmisid = row['dmisid'] - epiweek = int(row['epiweek']) - if dmisid not in starttime_record: - starttime_record[dmisid] = epiweek - else: - starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) - return starttime_record - -def dmisid_start_time(): - record1 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "00to13.csv")) - record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) - record = record1 - for dmisid, epiweek in record2.items(): - if dmisid in record: - record[dmisid] = min(record[dmisid], epiweek) - else: - record[dmisid] = epiweek - return record - -def fillin_zero_to_csv(period, dmisid_start_record): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(period)) - dst_path = os.path.join(TARGET_DIR, "filled_{}.csv".format(period)) - - # Load data into a dictionary - src_csv = open(src_path, "r") - reader = csv.DictReader(src_csv) - - results_dict = dict() # epiweek -> dmisid -> flu_type: visit_sum - for i, row in enumerate(reader): - epiweek = int(row['epiweek']) - dmisid = row['dmisid'] - flu_type = row['flu_type'] - visit_sum = row['visit_sum'] - if epiweek not in results_dict: - results_dict[epiweek] = dict() - week_dict = results_dict[epiweek] - if dmisid not in week_dict: - week_dict[dmisid] = dict() - dmisid_dict = week_dict[dmisid] - dmisid_dict[flu_type] = visit_sum - - # Fill in zero count records - dmisid_group = dmisid_start_record.keys() - flutype_group = ["ili-flu3", "flu1", "flu2-flu1", "flu3-flu2"] - - for epiweek in results_dict.keys(): - week_dict = results_dict[epiweek] - for dmisid in dmisid_group: - start_week = dmisid_start_record[dmisid] - if start_week > epiweek: - continue - - if dmisid not in week_dict: - week_dict[dmisid] = dict() - - dmisid_dict = week_dict[dmisid] - for flutype in flutype_group: - if flutype not in dmisid_dict: - dmisid_dict[flutype] = 0 - - # Write to csv files - dst_csv = open(dst_path, "w") - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - i = 1 - for epiweek in results_dict: - for dmisid in results_dict[epiweek]: - for flutype in results_dict[epiweek][dmisid]: - visit_sum = results_dict[epiweek][dmisid][flutype] - row = {"id": i, "epiweek": epiweek, "dmisid": dmisid, - "flu_type": flutype, "visit_sum": visit_sum} - writer.writerow(row) - if i % 100000 == 0: - print(row) - i += 1 - print("Wrote {} rows".format(i)) - -######################### Functions for AFHSB data ########################## - -def main(): - # Build tables containing geographical information - state2region_csv() - dmisid() - - # Aggregate raw data into pickle files - aggregate_data("ili_1_2000_5_2013_new.sas7bdat", "00to13.pickle") - aggregate_data("ili_1_2013_11_2017_new.sas7bdat", "13to17.pickle") - - # write pickle content to csv files - write_afhsb_csv("00to13") - write_afhsb_csv("13to17") - - # Fill in zero count records - dmisid_start_record = dmisid_start_time() - fillin_zero_to_csv("00to13", dmisid_start_record) - fillin_zero_to_csv("13to17", dmisid_start_record) - - -if __name__ == '__main__': - main() diff --git a/src/acquisition/afhsb/afhsb_sql.py b/src/acquisition/afhsb/afhsb_sql.py deleted file mode 100644 index 278f3fc38..000000000 --- a/src/acquisition/afhsb/afhsb_sql.py +++ /dev/null @@ -1,194 +0,0 @@ -# standard library -import os - -# third 
party -import mysql.connector as connector - -# first party -import delphi.operations.secrets as secrets - - -def init_dmisid_table(sourcefile): - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - table_name = 'dmisid_table' - create_table_cmd = ''' - CREATE TABLE `{}` ( - `dmisid` INT(4) NOT NULL PRIMARY KEY, - `country` CHAR(3) NULL, - `state` CHAR(2) NULL - ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} - FIELDS TERMINATED BY ',' - ENCLOSED BY '"' - LINES TERMINATED BY '\r\n' - IGNORE 1 ROWS - (@dmisid, @country, @state, @zip5) - SET - dmisid = @dmisid, - country = nullif(@country, ''), - state = nullif(@state, '') - ; - '''.format(sourcefile, table_name) - try: - cursor = cnx.cursor() - cursor.execute(create_table_cmd) - cursor.execute(populate_table_cmd) - cnx.commit() - finally: - cnx.close() - -def init_region_table(sourcefile): - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - table_name = 'state2region_table' - create_table_cmd = ''' - CREATE TABLE `{}` ( - `state` CHAR(2) NOT NULL PRIMARY KEY, - `hhs` CHAR(5) NOT NULL, - `cen` CHAR(4) NOT NULL - ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} - FIELDS TERMINATED BY ',' - ENCLOSED BY '"' - LINES TERMINATED BY '\r\n' - IGNORE 1 ROWS - (@state, @hhs, @cen) - SET state=@state, hhs=@hhs, cen=@cen; - '''.format(sourcefile, table_name) - try: - cursor = cnx.cursor() - cursor.execute(create_table_cmd) - cursor.execute(populate_table_cmd) - cnx.commit() - finally: - cnx.close() - - -def init_raw_data(table_name, sourcefile): - print("Initialize {}".format(table_name)) - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - create_table_cmd = ''' - CREATE TABLE IF NOT EXISTS `{}` ( - `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, - `epiweek` INT(6) NOT NULL, - `dmisid` CHAR(4) NULL, - `flu_type` CHAR(9) NOT NULL, - `visit_sum` INT(11) NOT NULL, - - KEY `epiweek` (`epiweek`), - KEY `dmisid` (`dmisid`), - KEY `flu_type` (`flu_type`) - ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} - FIELDS TERMINATED BY ',' - ENCLOSED BY '"' - LINES TERMINATED BY '\r\n' - IGNORE 1 ROWS - (@id, @epiweek, @dmisid, @flu, @visits) - SET - id = @id, - epiweek = @epiweek, - dmisid = nullif(@dmisid, 'ZZZZ'), - flu_type = @flu, - visit_sum = @visits - ; - '''.format(sourcefile, table_name) - try: - cursor = cnx.cursor() - cursor.execute(create_table_cmd) - cursor.execute(populate_table_cmd) - cnx.commit() - finally: - cnx.close() - -def agg_by_state(src_table, dest_table): - print("Aggregating records by states...") - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - cmd = ''' - CREATE TABLE {} - SELECT a.epiweek, a.flu_type, d.state, d.country, sum(a.visit_sum) visit_sum - FROM {} a - LEFT JOIN dmisid_table d - ON a.dmisid = d.dmisid - GROUP BY a.epiweek, a.flu_type, d.state, d.country; - '''.format(dest_table, src_table) - try: - cursor = cnx.cursor() - cursor.execute(cmd) - cnx.commit() - finally: - cnx.close() - -def agg_by_region(src_table, dest_table): - print("Aggregating records by regions...") - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - cmd = ''' - CREATE TABLE {} - SELECT s.epiweek, s.flu_type, r.hhs, r.cen, sum(s.visit_sum) visit_sum - FROM {} s - LEFT JOIN state2region_table r - ON s.state = r.state - GROUP 
BY s.epiweek, s.flu_type, r.hhs, r.cen; - '''.format(dest_table, src_table) - try: - cursor = cnx.cursor() - cursor.execute(cmd) - cnx.commit() - finally: - cnx.close() - -def init_all_tables(datapath): - init_dmisid_table(os.path.join(datapath, "simple_DMISID_FY2018.csv")) - init_region_table(os.path.join(datapath, "state2region.csv")) - - periods = ["00to13", "13to17"] - for period in periods: - raw_table_name = 'afhsb_{}_raw'.format(period) - state_table_name = 'afhsb_{}_state'.format(period) - region_table_name = 'afhsb_{}_region'.format(period) - - init_raw_data(raw_table_name, os.path.join(datapath, "filled_{}.csv".format(period))) - agg_by_state(raw_table_name, state_table_name) - agg_by_region(state_table_name, region_table_name) - -def dangerously_drop_all_afhsb_tables(): - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - try: - cursor = cnx.cursor() - cursor.execute(''' - DROP TABLE IF EXISTS `afhsb_00to13_raw`, - `afhsb_00to13_region`, - `afhsb_00to13_state`, - `afhsb_13to17_raw`, - `afhsb_13to17_region`, - `afhsb_13to17_state`, - `state2region_table`, - `dmisid_table`; - ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) - finally: - cnx.close() - -def run_cmd(cmd): - (u, p) = secrets.db.epi - cnx = connector.connect(user=u, passwd=p, database="epidata") - try: - cursor = cnx.cursor() - cursor.execute(cmd) - cnx.commit() - finally: - cnx.close() diff --git a/src/acquisition/afhsb/afhsb_update.py b/src/acquisition/afhsb/afhsb_update.py deleted file mode 100644 index c5a8635c8..000000000 --- a/src/acquisition/afhsb/afhsb_update.py +++ /dev/null @@ -1,39 +0,0 @@ -# standard library -import argparse -import tempfile -import os -import stat -import shutil - -# first party -from . import afhsb_sql - -DEFAULT_DATAPATH = '/home/automation/afhsb_data' - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--datapath', action='store', type=str, default=DEFAULT_DATAPATH, help='filepath to directory containing csv files to input into database') - args = parser.parse_args() - # MariaDB appears to refuse to LOAD DATA INFILE except on files under - # /var/lib/mysql (which seems dedicated to its own files) or /tmp; create a - # temporary directory, make rwx for automation & rx for mysql user, copy in - # (or alternatively, symlink --- unimplemented) args.datapath to the - # temporary directory, then run init_all_tables on this temporary datapath. - # Set up temporary directory that will hold temporary datapath (initial - # permissions are very restrictive): - tmp_datapath_parent_dir = tempfile.mkdtemp() - os.chmod(tmp_datapath_parent_dir, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP) - shutil.chown(tmp_datapath_parent_dir, group="mysql_automation") - # (here, mysql_automation is a group with members {mysql,automation}) - tmp_datapath = os.path.join(tmp_datapath_parent_dir, "afhsb_data") - # Copy datapath to temporary datapath (initial permission of copy are - # permissive, but require directory access, which was set appropriately - # above): - shutil.copytree(args.datapath, tmp_datapath) - # Run init_all_tables on temporary datapath: - afhsb_sql.init_all_tables(tmp_datapath) - # (Temporary parent directory should be deleted automatically.) 
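# A minimal sketch of the LOAD DATA INFILE permission workaround described in the
# comments above, consolidated into a context manager. It assumes the same
# "mysql_automation" group (members: mysql and automation) mentioned there, and uses
# TemporaryDirectory so the staged copy is removed on exit; this is an illustration
# of the technique, not the module's own implementation.
import contextlib
import os
import shutil
import stat
import tempfile

@contextlib.contextmanager
def staged_for_load_data_infile(src_datapath, group="mysql_automation"):
    """Yield a copy of src_datapath that the MariaDB server user can read."""
    with tempfile.TemporaryDirectory() as parent:
        # rwx for the owner (automation), rx for the shared group (so mysqld can read)
        os.chmod(parent, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
        shutil.chown(parent, group=group)
        staged = os.path.join(parent, os.path.basename(os.path.normpath(src_datapath)))
        shutil.copytree(src_datapath, staged)
        yield staged  # e.g. afhsb_sql.init_all_tables(staged)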
- - -if __name__ == '__main__': - main() From cdf3832cec31d3f96d0b00f62b48f9f5c171d55c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 14:55:24 -0700 Subject: [PATCH 06/43] feat(afhsb): remove afhsb from Epidata Python client --- src/client/delphi_epidata.py | 56 ------------------------------------ 1 file changed, 56 deletions(-) diff --git a/src/client/delphi_epidata.py b/src/client/delphi_epidata.py index a56527357..654eba74c 100644 --- a/src/client/delphi_epidata.py +++ b/src/client/delphi_epidata.py @@ -394,62 +394,6 @@ def meta_norostat(auth): # Make the API call return Epidata._request(params) - # Fetch AFHSB data - @staticmethod - def afhsb(auth, locations, epiweeks, flu_types): - """Fetch AFHSB data (point data, no min/max).""" - # Check parameters - if auth is None or locations is None or epiweeks is None or flu_types is None: - raise Exception('`auth`, `locations`, `epiweeks` and `flu_types` are all required') - - loc_exception = 'Location parameter `{}` is invalid. Valid `location` parameters are: '\ - '`hhs[1-10]`, `cen[1-9]`, 2-letter state code or 3-letter country code.' - for location in locations: - location = location.lower() - if (location.startswith('hhs') or location.startswith('cen')): - prefix, postfix = location[:3], location[3:] - if (postfix.isnumeric()): - region_num = int(postfix) - if (region_num < 1 or region_num > 10 or (region_num == 10 and prefix == 'cen')): - raise Exception(loc_exception.format(location)) - else: - raise Exception(loc_exception.format(location)) - elif (len(location) < 2 or len(location) > 3): - raise Exception(loc_exception.format(location)) - - flu_exception = 'Flu-type parameters `{}` is invalid. Valid flu-type parameters are: '\ - '`flu1`, `flu2`, `flu3`, `ili`, `flu2-flu1`, `flu3-flu2`, `ili-flu3`.' 
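# A small worked example of the disjoint/subset split referenced in the message above
# (see FLU_MAPPING in the afhsb server endpoint below): subset totals are just sums of
# the disjoint buckets. The visit counts here are invented purely for illustration.
disjoint_counts = {"flu1": 10, "flu2-flu1": 4, "flu3-flu2": 3, "ili-flu3": 8}
flu_mapping = {
    "flu2": ["flu1", "flu2-flu1"],
    "flu3": ["flu1", "flu2-flu1", "flu3-flu2"],
    "ili": ["flu1", "flu2-flu1", "flu3-flu2", "ili-flu3"],
}
subset_counts = {k: sum(disjoint_counts[b] for b in buckets) for k, buckets in flu_mapping.items()}
assert subset_counts == {"flu2": 14, "flu3": 17, "ili": 25}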
- valid_flu_types = ['flu1', 'flu2', 'flu3', 'ili', 'flu2-flu1', 'flu3-flu2', 'ili-flu3'] - for flu_type in flu_types: - if (not flu_type in valid_flu_types): - raise Exception(flu_exception.format(flu_type)) - - # Set up request - params = { - 'endpoint': 'afhsb', - 'auth': auth, - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - 'flu_types': Epidata._list(flu_types) - } - # Make the API call - return Epidata._request(params) - - # Fetch AFHSB metadata - @staticmethod - def meta_afhsb(auth): - """Fetch AFHSB metadata.""" - # Check parameters - if auth is None: - raise Exception('`auth` is required') - # Set up request - params = { - 'endpoint': 'meta_afhsb', - 'auth': auth, - } - # Make the API call - return Epidata._request(params) - # Fetch NIDSS flu data @staticmethod def nidss_flu(regions, epiweeks, issues=None, lag=None): From 7a8ee39eb0c0fd1daefaac7c9c4f34ff534eebe6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 14:55:30 -0700 Subject: [PATCH 07/43] feat(afhsb): remove afhsb from Epidata R client --- src/client/delphi_epidata.R | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/src/client/delphi_epidata.R b/src/client/delphi_epidata.R index 06e9c2209..627948cc2 100644 --- a/src/client/delphi_epidata.R +++ b/src/client/delphi_epidata.R @@ -371,39 +371,6 @@ Epidata <- (function() { return(.request(params)) } - # Fetch AFHSB data (point data, no min/max) - afhsb <- function(auth, locations, epiweeks, flu_types) { - # Check parameters - if(missing(auth) || missing(locations) || missing(epiweeks) || missing(flu_types)) { - stop('`auth`, `locations`, `epiweeks` and `flu_types` are all required') - } - # Set up request - params <- list( - endpoint = 'afhsb', - auth = auth, - locations = .list(locations), - epiweeks = .list(epiweeks), - flu_types = .list(flu_types) - ) - # Make the API call - return(.request(params)) - } - - # Fetch AFHSB metadata - meta_afhsb <- function(auth) { - # Check parameters - if(missing(auth)) { - stop('`auth` is required') - } - # Set up request - params <- list( - endpoint = 'meta_afhsb', - auth = auth - ) - # Make the API call - return(.request(params)) - } - # Fetch NIDSS flu data nidss.flu <- function(regions, epiweeks, issues, lag) { # Check parameters @@ -662,8 +629,6 @@ Epidata <- (function() { quidel = quidel, norostat = norostat, meta_norostat = meta_norostat, - afhsb = afhsb, - meta_afhsb = meta_afhsb, nidss.flu = nidss.flu, nidss.dengue = nidss.dengue, delphi = delphi, From 9ce66351aa811045e1ca38037ab6d669c0bcb3bb Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 14:55:53 -0700 Subject: [PATCH 08/43] feat(afhsb): remove afhsb endpoint from server --- src/server/endpoints/__init__.py | 4 - src/server/endpoints/afhsb.py | 114 ----------------------------- src/server/endpoints/meta_afhsb.py | 31 -------- 3 files changed, 149 deletions(-) delete mode 100644 src/server/endpoints/afhsb.py delete mode 100644 src/server/endpoints/meta_afhsb.py diff --git a/src/server/endpoints/__init__.py b/src/server/endpoints/__init__.py index b58692676..94f1de5b8 100644 --- a/src/server/endpoints/__init__.py +++ b/src/server/endpoints/__init__.py @@ -1,5 +1,4 @@ from . 
import ( - afhsb, cdc, covid_hosp_facility_lookup, covid_hosp_facility, @@ -19,7 +18,6 @@ ght, ilinet, kcdc_ili, - meta_afhsb, meta_norostat, meta, nidss_dengue, @@ -36,7 +34,6 @@ ) endpoints = [ - afhsb, cdc, covid_hosp_facility_lookup, covid_hosp_facility, @@ -56,7 +53,6 @@ ght, ilinet, kcdc_ili, - meta_afhsb, meta_norostat, meta, nidss_dengue, diff --git a/src/server/endpoints/afhsb.py b/src/server/endpoints/afhsb.py deleted file mode 100644 index a006defac..000000000 --- a/src/server/endpoints/afhsb.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import Dict, List - -from flask import Blueprint, request - -from .._params import extract_integers, extract_strings -from .._query import execute_queries, filter_integers, filter_strings -from .._validate import require_all -from .._security import require_role - -# first argument is the endpoint name -bp = Blueprint("afhsb", __name__) -alias = None - - -def _split_locations(locations: List[str]): - # split locations into national/regional/state - location_dict: Dict[str, List[str]] = { - "hhs": [], - "cen": [], - "state": [], - "country": [], - } - for location in locations: - location = location.lower() - if location[0:3] == "hhs": - location_dict["hhs"].append(location) - elif location[0:3] == "cen": - location_dict["cen"].append(location) - elif len(location) == 3: - location_dict["country"].append(location) - elif len(location) == 2: - location_dict["state"].append(location) - return location_dict - - -def _split_flu_types(flu_types: List[str]): - # split flu types into disjoint/subset - disjoint_flus = [] - subset_flus = [] - for flu_type in flu_types: - if flu_type in ["flu1", "flu2-flu1", "flu3-flu2", "ili-flu3"]: - disjoint_flus.append(flu_type) - elif flu_type in ["flu2", "flu3", "ili"]: - subset_flus.append(flu_type) - return disjoint_flus, subset_flus - - -FLU_MAPPING = { - "flu2": ["flu1", "flu2-flu1"], - "flu3": ["flu1", "flu2-flu1", "flu3-flu2"], - "ili": ["flu1", "flu2-flu1", "flu3-flu2", "ili-flu3"], -} - - -@bp.route("/", methods=("GET", "POST")) -@require_role("afhsb") -def handle(): - require_all(request, "locations", "epiweeks", "flu_types") - - locations = extract_strings("locations") - epiweeks = extract_integers("epiweeks") - flu_types = extract_strings("flu_types") - - disjoint_flus, subset_flus = _split_flu_types(flu_types) - location_dict = _split_locations(locations) - - # build query - - queries = [] - for location_type, loc in location_dict.items(): - if not loc: - continue - table = ( - "afhsb_00to13_region" - if location_type in ["hhs", "cen"] - else "afhsb_00to13_state" - ) - fields = ( - f"`epiweek`, `{location_type}` `location`, sum(`visit_sum`) `visit_sum`" - ) - group = "`epiweek`, `location`" - order = "`epiweek` ASC, `location` ASC" - # build the filter - params = dict() - # build the epiweek filter - condition_epiweek = filter_integers("nd.`epiweek`", epiweeks, "epiweek", params) - condition_location = filter_strings(location_type, locations, "loc", params) - - for subset_flu in subset_flus: - flu_params = params.copy() - condition_flu = filter_strings( - "`flu_type`", FLU_MAPPING[subset_flu], "flu_type", flu_params - ) - query = f"""SELECT {fields}, '{subset_flu}' `flu_type` FROM {table} - WHERE ({condition_epiweek}) AND ({condition_location}) AND ({condition_flu}) - GROUP BY {group} ORDER BY {order}""" - queries.append((query, flu_params)) - # disjoint flu types: flu1, flu2-flu1, flu3-flu2, ili-flu3 - if disjoint_flus: - flu_params = params.copy() - condition_flu = filter_strings( - "`flu_type`", 
disjoint_flus, "flu_type", flu_params - ) - query = f"""SELECT {fields}, `flu_type` FROM {table} - WHERE ({condition_epiweek}) AND ({condition_location}) AND ({condition_flu}) - GROUP BY {group},`flu_type` ORDER BY {order},`flu_type`""" - queries.append((query, flu_params)) - - fields_string = ["location", "flu_type"] - fields_int = ["epiweek", "visit_sum"] - fields_float = [] - - # send query - return execute_queries(queries, fields_string, fields_int, fields_float) diff --git a/src/server/endpoints/meta_afhsb.py b/src/server/endpoints/meta_afhsb.py deleted file mode 100644 index 096ab58ec..000000000 --- a/src/server/endpoints/meta_afhsb.py +++ /dev/null @@ -1,31 +0,0 @@ -from flask import Blueprint, request - -from .._printer import print_non_standard -from .._query import parse_result -from .._security import require_role - - -# first argument is the endpoint name -bp = Blueprint("meta_afhsb", __name__) -alias = None - - -@bp.route("/", methods=("GET", "POST")) -@require_role("afhsb") -def handle(): - # build query - table1 = "afhsb_00to13_state" - table2 = "afhsb_13to17_state" - - string_keys = ["state", "country"] - int_keys = ["flu_severity"] - data = dict() - - for key in string_keys: - query = f"SELECT DISTINCT `{key}` FROM (select `{key}` from `{table1}` union select `{key}` from `{table2}`) t" - data[key] = parse_result(query, {}, [key]) - for key in int_keys: - query = f"SELECT DISTINCT `{key}` FROM (select `{key}` from `{table1}` union select `{key}` from `{table2}`) t" - data[key] = parse_result(query, {}, [], [key]) - - return print_non_standard(request.values.get("format"), data) From 2878980e4b4810f5594b9bf7a0afb539089d67d9 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:33:33 -0700 Subject: [PATCH 09/43] feat(afhsb): remove afhsb from deploy.json --- deploy.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/deploy.json b/deploy.json index 59d141ba4..654d669cc 100644 --- a/deploy.json +++ b/deploy.json @@ -174,15 +174,6 @@ "add-header-comment": true }, - "// acquisition - afhsb", - { - "type": "move", - "src": "src/acquisition/afhsb/", - "dst": "[[package]]/acquisition/afhsb/", - "match": "^.*\\.(py)$", - "add-header-comment": true - }, - "// acquisition - covidcast", { "type": "move", From d554566cbb323a3ffced9690e5dabdff41c387d4 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:33:59 -0700 Subject: [PATCH 10/43] feat(afhsb): remove afhsb from setup.cfg --- dev/local/setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/local/setup.cfg b/dev/local/setup.cfg index 443359b25..d7383ade1 100644 --- a/dev/local/setup.cfg +++ b/dev/local/setup.cfg @@ -6,7 +6,6 @@ version = 4.1.3 packages = delphi.epidata delphi.epidata.acquisition - delphi.epidata.acquisition.afhsb delphi.epidata.acquisition.cdcp delphi.epidata.acquisition.covid_hosp delphi.epidata.acquisition.covid_hosp.common From 316ecd18f27776b465f738db66ba2c628cd2803d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:34:13 -0700 Subject: [PATCH 11/43] feat(afhsb): remove norostat from setup.cfg --- dev/local/setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/local/setup.cfg b/dev/local/setup.cfg index d7383ade1..69bc91778 100644 --- a/dev/local/setup.cfg +++ b/dev/local/setup.cfg @@ -20,7 +20,6 @@ packages = delphi.epidata.acquisition.ght delphi.epidata.acquisition.kcdc delphi.epidata.acquisition.nidss - delphi.epidata.acquisition.norostat delphi.epidata.acquisition.paho delphi.epidata.acquisition.quidel 
delphi.epidata.acquisition.twtr From 5a6cefc004d45a45935436c93a3ba4ed636787df Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:34:29 -0700 Subject: [PATCH 12/43] docs(afhsb): remove afhsb and its meta --- docs/api/afhsb.md | 52 ------------------------------------------ docs/api/meta_afhsb.md | 49 --------------------------------------- 2 files changed, 101 deletions(-) delete mode 100644 docs/api/afhsb.md delete mode 100644 docs/api/meta_afhsb.md diff --git a/docs/api/afhsb.md b/docs/api/afhsb.md deleted file mode 100644 index d53ad643e..000000000 --- a/docs/api/afhsb.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: AFHSB -parent: Other Endpoints (COVID-19 and Other Diseases) ---- - -# AFHSB - -This is the API documentation for accessing the AFHSB (`afhsb`) endpoint of -[Delphi](https://delphi.cmu.edu/)'s epidemiological data. - -General topics not specific to any particular endpoint are discussed in the -[API overview](README.md). Such topics include: -[contributing](README.md#contributing), [citing](README.md#citing), and -[data licensing](README.md#data-licensing). - -## AFHSB Data - -... - -# The API - -The base URL is: https://api.delphi.cmu.edu/epidata/afhsb/ - -See [this documentation](README.md) for details on specifying epiweeks, dates, and lists. - -## Parameters - -### Required - -| Parameter | Description | Type | -| --- | --- | --- | -| `auth` | password | string | -| `epiweeks` | epiweeks | `list` of epiweeks | -| `locations` | locations | `list` of [region](https://github.com/cmu-delphi/delphi-epidata/blob/main/labels/regions.txt), [state](https://github.com/cmu-delphi/delphi-epidata/blob/main/labels/states.txt), or 3-letter country code labels | -| `flu_types` | flu types | `list` of disjoint (`flu1`, `flu2-flu1`, `flu3-flu2`, `ili-flu3`) or subset (`flu2`, `flu3`, `ili`) flu type labels | - -## Response - -| Field | Description | Type | -|-----------|-----------------------------------------------------------------|------------------| -| `result` | result code: 1 = success, 2 = too many results, -2 = no results | integer | -| `epidata` | list of results | array of objects | -| ... | ... | ... | -| `message` | `success` or error message | string | - -# Example URLs - - - -# Code Samples - - diff --git a/docs/api/meta_afhsb.md b/docs/api/meta_afhsb.md deleted file mode 100644 index 6ba294772..000000000 --- a/docs/api/meta_afhsb.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: AFHSB Metadata -parent: Other Endpoints (COVID-19 and Other Diseases) ---- - -# AFHSB Metadata - -This is the documentation of the API for accessing the AFHSB Metadata (`meta_afhsb`) endpoint of -the [Delphi](https://delphi.cmu.edu/)'s epidemiological data. - -General topics not specific to any particular endpoint are discussed in the -[API overview](README.md). Such topics include: -[contributing](README.md#contributing), [citing](README.md#citing), and -[data licensing](README.md#data-licensing). - -## AFHSB Metadata - -... - -# The API - -The base URL is: https://api.delphi.cmu.edu/epidata/meta_afhsb/ - -See [this documentation](README.md) for details on specifying epiweeks, dates, and lists. 
- -## Parameters - -### Required - -| Parameter | Description | Type | -|-----------|-------------|--------| -| `auth` | password | string | - -## Response - -| Field | Description | Type | -|-----------|-----------------------------------------------------------------|------------------| -| `result` | result code: 1 = success, 2 = too many results, -2 = no results | integer | -| `epidata` | list of results | array of objects | -| ... | ... | ... | -| `message` | `success` or error message | string | - -# Example URLs - - - -# Code Samples - - From 7a2fc7b0c53d944c03ce4d2e878a216814e10a24 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:34:39 -0700 Subject: [PATCH 13/43] docs(afhsb): remove afhsb from README --- docs/api/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index dd8f98d5c..709d068e0 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -110,7 +110,6 @@ The parameters available for each source are documented in each linked source-sp | Endpoint | Name | Description | Restricted? | | --- | --- | --- | --- | -| [`afhsb`](ahfsb.md) | AFHSB | ... | yes | | [`cdc`](cdc.md) | CDC Page Hits | ... | yes | | [`delphi`](delphi.md) | Delphi's Forecast | ... | no | | [`ecdc_ili`](ecdc_ili.md) | ECDC ILI | ECDC ILI data from the ECDC website. | no | @@ -122,7 +121,6 @@ The parameters available for each source are documented in each linked source-sp | [`ght`](ght.md) | Google Health Trends | Estimate of influenza activity based on volume of certain search queries. | yes | | [`kcdc_ili`](kcdc_ili.md) | KCDC ILI | KCDC ILI data from KCDC website. | no | | [`meta`](meta.md) | API Metadata | Metadata for `fluview`, `twitter`, `wiki`, and `delphi`. | no | -| [`meta_afhsb`](meta_afhsb.md) | AFHSB Metadata | ... | yes | | [`nidss_flu`](nidss_flu.md) | NIDSS Flu | Outpatient ILI from Taiwan's National Infectious Disease Statistics System (NIDSS). | no | | [`nowcast`](nowcast.md) | ILI Nearby | A nowcast of U.S. national, regional, and state-level (weighted) percent ILI, available seven days (regionally) or five days (state-level) before the first ILINet report for the corresponding week. | no | | [`quidel`](quidel.md) | Quidel | Data provided by Quidel Corp., which contains flu lab test results. 
| yes | From 92b546f3129e028153ea3ce6be8d38a0a7f10b23 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 18:35:03 -0700 Subject: [PATCH 14/43] feat(afhsb): remove afhsb from the js client --- src/client/delphi_epidata.js | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/client/delphi_epidata.js b/src/client/delphi_epidata.js index cf06ae976..117fe8949 100644 --- a/src/client/delphi_epidata.js +++ b/src/client/delphi_epidata.js @@ -123,19 +123,6 @@ version: () => { return _request('version', {}).then((r) => Object.assign(r, {client_version})); }, - /** - * Fetch AFHSB data (point data, no min/max) - */ - afhsb: (auth, locations, epiweeks, flu_types) => { - requireAll({ auth, locations, epiweeks, flu_types }); - const params = { - auth, - locations: _list(locations), - epiweeks: _list(epiweeks), - flu_types: _list(flu_types), - }; - return _request("afhsb", params); - }, /** * Fetch CDC page hits */ @@ -387,16 +374,6 @@ }; return _request("kcdc_ili", params); }, - /** - * Fetch AFHSB metadata - */ - meta_afhsb: (auth) => { - requireAll({ auth }); - const params = { - auth, - }; - return _request("meta_afhsb", params); - }, /** * Fetch NoroSTAT metadata */ From 6fe6e7a7b3422e7a7579d95f104d602465e50d93 Mon Sep 17 00:00:00 2001 From: melange396 Date: Mon, 26 Jun 2023 11:56:57 -0400 Subject: [PATCH 15/43] use second db handle for only for user admin and writes (#1184) * change 'user_engine' to a 'WriteSession' instead, so the master db connection is used for writes [and associated admin session reads] only * eager-load roles, remove unnecessary methods, add @default_session, move session ctx mgrs to admin page * make sure sql statements and timing are logged for all engines, plus tag engines with id and log those too, and superfluous user method cleanup * sqlalchemy cleanup: removed superfluous bits, improved argument passing for engine creation * _assign_roles() does its own commit() and returns an instance of the newly updated User * raise Exception when trying to update non-existent User, return UserRole on creation. * use more appropriate reciever for static method call, and expand comment on static vs bound methods in User. --- src/server/_common.py | 9 +-- src/server/_db.py | 61 +++++++++++++++--- src/server/_security.py | 6 +- src/server/admin/models.py | 114 +++++++++++++++------------------- src/server/endpoints/admin.py | 99 +++++++++++++++-------------- src/server/main.py | 4 -- 6 files changed, 159 insertions(+), 134 deletions(-) diff --git a/src/server/_common.py b/src/server/_common.py index 56d4c38ec..f7c28c7ef 100644 --- a/src/server/_common.py +++ b/src/server/_common.py @@ -3,7 +3,7 @@ from flask import Flask, g, request from sqlalchemy import event -from sqlalchemy.engine import Connection +from sqlalchemy.engine import Connection, Engine from werkzeug.exceptions import Unauthorized from werkzeug.local import LocalProxy @@ -85,12 +85,12 @@ def log_info_with_request_and_response(message, response, **kwargs): **kwargs ) -@event.listens_for(engine, "before_cursor_execute") +@event.listens_for(Engine, "before_cursor_execute") def before_cursor_execute(conn, cursor, statement, parameters, context, executemany): context._query_start_time = time.time() -@event.listens_for(engine, "after_cursor_execute") +@event.listens_for(Engine, "after_cursor_execute") def after_cursor_execute(conn, cursor, statement, parameters, context, executemany): # this timing info may be suspect, at least in terms of dbms cpu time... 
# it is likely that it includes that time as well as any overhead that @@ -101,7 +101,8 @@ def after_cursor_execute(conn, cursor, statement, parameters, context, executema # Convert to milliseconds total_time *= 1000 get_structured_logger("server_api").info( - "Executed SQL", statement=statement, params=parameters, elapsed_time_ms=total_time + "Executed SQL", statement=statement, params=parameters, elapsed_time_ms=total_time, + engine_id=conn.get_execution_options().get('engine_id') ) diff --git a/src/server/_db.py b/src/server/_db.py index 53e632cdf..e65c885ff 100644 --- a/src/server/_db.py +++ b/src/server/_db.py @@ -1,4 +1,7 @@ -from sqlalchemy import create_engine, MetaData +import functools +from inspect import signature, Parameter + +from sqlalchemy import create_engine from sqlalchemy.engine import Engine from sqlalchemy.orm import sessionmaker @@ -9,15 +12,57 @@ # previously `_common` imported from `_security` which imported from `admin.models`, which imported (back again) from `_common` for database connection objects -engine: Engine = create_engine(SQLALCHEMY_DATABASE_URI, **SQLALCHEMY_ENGINE_OPTIONS) -if SQLALCHEMY_DATABASE_URI_PRIMARY: - user_engine: Engine = create_engine(SQLALCHEMY_DATABASE_URI_PRIMARY, **SQLALCHEMY_ENGINE_OPTIONS) -else: - user_engine: Engine = engine +# a decorator to automatically provide a sqlalchemy session by default, if an existing session is not explicitly +# specified to override it. it is preferred to use a single session for a sequence of operations logically grouped +# together, but this allows individual operations to be run by themselves without having to provide an +# already-established session. requires an argument to the wrapped function named 'session'. +# for instance: +# +# @default_session(WriteSession) +# def foo(session): +# pass +# +# # calling: +# foo() +# # is identical to: +# with WriteSession() as s: +# foo(s) +def default_session(sess): + def decorator__default_session(func): + # make sure `func` is compatible w/ this decorator + func_params = signature(func).parameters + if 'session' not in func_params or func_params['session'].kind == Parameter.POSITIONAL_ONLY: + raise Exception(f"@default_session(): function {func.__name__}() must accept an argument 'session' that can be specified by keyword.") + # save position of 'session' arg, to later check if its been passed in by position/order + sess_index = list(func_params).index('session') + + @functools.wraps(func) + def wrapper__default_session(*args, **kwargs): + if 'session' in kwargs or len(args) >= sess_index+1: + # 'session' has been specified by the caller, so we have nothing to do here. pass along all args unchanged. + return func(*args, **kwargs) + # otherwise, we will wrap this call with a context manager for the default session provider, and pass that session instance to the wrapped function. + with sess() as session: + return func(*args, **kwargs, session=session) -metadata = MetaData(bind=user_engine) + return wrapper__default_session -Session = sessionmaker(bind=user_engine) + return decorator__default_session +engine: Engine = create_engine(SQLALCHEMY_DATABASE_URI, **SQLALCHEMY_ENGINE_OPTIONS, execution_options={'engine_id': 'default'}) +Session = sessionmaker(bind=engine) + +if SQLALCHEMY_DATABASE_URI_PRIMARY and SQLALCHEMY_DATABASE_URI_PRIMARY != SQLALCHEMY_DATABASE_URI: + # if available, use the main/primary DB for write operations. DB replication processes should be in place to + # propagate any written changes to the regular (load balanced) replicas. 
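# A minimal, self-contained sketch of the engine-tagging pattern introduced above,
# assuming SQLAlchemy 1.4+: listeners registered on the Engine class fire for every
# engine, and the per-engine execution_options carry an id the listener can read back.
# The in-memory SQLite URLs are placeholders for illustration only.
import time
from sqlalchemy import create_engine, event, text
from sqlalchemy.engine import Engine

@event.listens_for(Engine, "before_cursor_execute")
def _start_timer(conn, cursor, statement, parameters, context, executemany):
    context._query_start_time = time.time()

@event.listens_for(Engine, "after_cursor_execute")
def _log_elapsed(conn, cursor, statement, parameters, context, executemany):
    elapsed_ms = (time.time() - context._query_start_time) * 1000
    engine_id = conn.get_execution_options().get("engine_id")
    print(f"[{engine_id}] {statement} ({elapsed_ms:.1f} ms)")

read_engine = create_engine("sqlite://", execution_options={"engine_id": "default"})
write_only = create_engine("sqlite://", execution_options={"engine_id": "write_engine"})
with write_only.connect() as conn:
    conn.execute(text("SELECT 1"))  # logged with engine_id == "write_engine"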
+ write_engine: Engine = create_engine(SQLALCHEMY_DATABASE_URI_PRIMARY, **SQLALCHEMY_ENGINE_OPTIONS, execution_options={'engine_id': 'write_engine'}) + WriteSession = sessionmaker(bind=write_engine) + # TODO: insert log statement acknowledging this second session handle is in use? +else: + write_engine: Engine = engine + WriteSession = Session +# NOTE: `WriteSession` could be called `AdminSession`, as its only (currently) used by the admin page, and the admin +# page is the only thing that should be writing to the db. its tempting to let the admin page read from the +# regular `Session` and write with `WriteSession`, but concurrency problems may arise from sync/replication lag. diff --git a/src/server/_security.py b/src/server/_security.py index 761d088c3..61e2608b2 100644 --- a/src/server/_security.py +++ b/src/server/_security.py @@ -16,7 +16,7 @@ TEMPORARY_API_KEY, URL_PREFIX, ) -from .admin.models import User, UserRole +from .admin.models import User API_KEY_HARD_WARNING = API_KEY_REQUIRED_STARTING_AT - timedelta(days=14) API_KEY_SOFT_WARNING = API_KEY_HARD_WARNING - timedelta(days=14) @@ -91,10 +91,6 @@ def _get_current_user(): current_user: User = cast(User, LocalProxy(_get_current_user)) -def register_user_role(role_name: str) -> None: - UserRole.create_role(role_name) - - def _is_public_route() -> bool: public_routes_list = ["lib", "admin", "version"] for route in public_routes_list: diff --git a/src/server/admin/models.py b/src/server/admin/models.py index 62cbc186d..f5c0d54ed 100644 --- a/src/server/admin/models.py +++ b/src/server/admin/models.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import relationship from copy import deepcopy -from .._db import Session +from .._db import Session, WriteSession, default_session from delphi.epidata.common.logger import get_structured_logger from typing import Set, Optional, List @@ -25,7 +25,7 @@ def _default_date_now(): class User(Base): __tablename__ = "api_user" id = Column(Integer, primary_key=True, autoincrement=True) - roles = relationship("UserRole", secondary=association_table) + roles = relationship("UserRole", secondary=association_table, lazy="joined") # last arg does an eager load of this property from foreign tables api_key = Column(String(50), unique=True, nullable=False) email = Column(String(320), unique=True, nullable=False) created = Column(Date, default=_default_date_now) @@ -35,97 +35,85 @@ def __init__(self, api_key: str, email: str = None) -> None: self.api_key = api_key self.email = email - @staticmethod - def list_users() -> List["User"]: - with Session() as session: - return session.query(User).all() - @property def as_dict(self): return { "id": self.id, "api_key": self.api_key, "email": self.email, - "roles": User.get_user_roles(self.id), + "roles": set(role.name for role in self.roles), "created": self.created, "last_time_used": self.last_time_used } - @staticmethod - def get_user_roles(user_id: int) -> Set[str]: - with Session() as session: - user = session.query(User).filter(User.id == user_id).first() - return set([role.name for role in user.roles]) - def has_role(self, required_role: str) -> bool: - return required_role in User.get_user_roles(self.id) + return required_role in set(role.name for role in self.roles) @staticmethod def _assign_roles(user: "User", roles: Optional[Set[str]], session) -> None: - # NOTE: this uses a borrowed/existing `session`, and thus does not do a `session.commit()`... - # that is the responsibility of the caller! 
get_structured_logger("api_user_models").info("setting roles", roles=roles, user_id=user.id, api_key=user.api_key) db_user = session.query(User).filter(User.id == user.id).first() # TODO: would it be sufficient to use the passed-in `user` instead of looking up this `db_user`? + # or even use this as a bound method instead of a static?? + # same goes for `update_user()` and `delete_user()` below... if roles: - roles_to_assign = session.query(UserRole).filter(UserRole.name.in_(roles)).all() - db_user.roles = roles_to_assign + db_user.roles = session.query(UserRole).filter(UserRole.name.in_(roles)).all() else: db_user.roles = [] + session.commit() + # retrieve the newly updated User object + return session.query(User).filter(User.id == user.id).first() @staticmethod + @default_session(Session) def find_user(*, # asterisk forces explicit naming of all arguments when calling this method - user_id: Optional[int] = None, api_key: Optional[str] = None, user_email: Optional[str] = None + session, + user_id: Optional[int] = None, api_key: Optional[str] = None, user_email: Optional[str] = None ) -> "User": # NOTE: be careful, using multiple arguments could match multiple users, but this will return only one! - with Session() as session: - user = ( - session.query(User) - .filter((User.id == user_id) | (User.api_key == api_key) | (User.email == user_email)) - .first() - ) + user = ( + session.query(User) + .filter((User.id == user_id) | (User.api_key == api_key) | (User.email == user_email)) + .first() + ) return user if user else None @staticmethod - def create_user(api_key: str, email: str, user_roles: Optional[Set[str]] = None) -> "User": + @default_session(WriteSession) + def create_user(api_key: str, email: str, session, user_roles: Optional[Set[str]] = None) -> "User": get_structured_logger("api_user_models").info("creating user", api_key=api_key) - with Session() as session: - new_user = User(api_key=api_key, email=email) - # TODO: we may need to populate 'created' field/column here, if the default - # specified above gets bound to the time of when that line of python was evaluated. 
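# Regarding the TODO above: SQLAlchemy evaluates a callable column default (as with
# `created = Column(Date, default=_default_date_now)` in this model) at INSERT time,
# so each new row gets the current date; only a default given as an already-computed
# value is frozen when the class is defined. A minimal sketch of the distinction,
# assuming SQLAlchemy 1.4+ and using a hypothetical table name:
from datetime import date
from sqlalchemy import Column, Date, Integer
from sqlalchemy.orm import declarative_base

ExampleBase = declarative_base()

class DefaultDemo(ExampleBase):
    __tablename__ = "default_demo"                # hypothetical, illustration only
    id = Column(Integer, primary_key=True)
    frozen = Column(Date, default=date.today())   # computed once, at class definition
    per_row = Column(Date, default=date.today)    # callable: computed for every INSERT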
- session.add(new_user) - session.commit() - User._assign_roles(new_user, user_roles, session) - session.commit() - return new_user + new_user = User(api_key=api_key, email=email) + session.add(new_user) + session.commit() + return User._assign_roles(new_user, user_roles, session) @staticmethod + @default_session(WriteSession) def update_user( user: "User", email: Optional[str], api_key: Optional[str], - roles: Optional[Set[str]] + roles: Optional[Set[str]], + session ) -> "User": get_structured_logger("api_user_models").info("updating user", user_id=user.id, new_api_key=api_key) - with Session() as session: - user = User.find_user(user_id=user.id) - if user: - update_stmt = ( - update(User) - .where(User.id == user.id) - .values(api_key=api_key, email=email) - ) - session.execute(update_stmt) - User._assign_roles(user, roles, session) - session.commit() - return user + user = User.find_user(user_id=user.id, session=session) + if not user: + raise Exception('user not found') + update_stmt = ( + update(User) + .where(User.id == user.id) + .values(api_key=api_key, email=email) + ) + session.execute(update_stmt) + return User._assign_roles(user, roles, session) @staticmethod - def delete_user(user_id: int) -> None: + @default_session(WriteSession) + def delete_user(user_id: int, session) -> None: get_structured_logger("api_user_models").info("deleting user", user_id=user_id) - with Session() as session: - session.execute(delete(User).where(User.id == user_id)) - session.commit() + session.execute(delete(User).where(User.id == user_id)) + session.commit() class UserRole(Base): @@ -134,23 +122,23 @@ class UserRole(Base): name = Column(String(50), unique=True) @staticmethod - def create_role(name: str) -> None: + @default_session(WriteSession) + def create_role(name: str, session) -> None: get_structured_logger("api_user_models").info("creating user role", role=name) - with Session() as session: - session.execute( - f""" + # TODO: check role doesnt already exist + session.execute(f""" INSERT INTO user_role (name) SELECT '{name}' WHERE NOT EXISTS (SELECT * FROM user_role WHERE name='{name}') - """ - ) - session.commit() + """) + session.commit() + return session.query(UserRole).filter(UserRole.name == name).first() @staticmethod - def list_all_roles(): - with Session() as session: - roles = session.query(UserRole).all() + @default_session(Session) + def list_all_roles(session): + roles = session.query(UserRole).all() return [role.name for role in roles] diff --git a/src/server/endpoints/admin.py b/src/server/endpoints/admin.py index 17bc9ca9b..a6f941b48 100644 --- a/src/server/endpoints/admin.py +++ b/src/server/endpoints/admin.py @@ -7,6 +7,7 @@ from .._common import log_info_with_request from .._config import ADMIN_PASSWORD, API_KEY_REGISTRATION_FORM_LINK, API_KEY_REMOVAL_REQUEST_LINK, REGISTER_WEBHOOK_TOKEN +from .._db import WriteSession from .._security import resolve_auth_token from ..admin.models import User, UserRole @@ -29,22 +30,13 @@ def _require_admin(): return token -def _parse_roles(roles: List[str]) -> Set[str]: - return set(roles) - - -def _render(mode: str, token: str, flags: Dict, **kwargs): +def _render(mode: str, token: str, flags: Dict, session, **kwargs): template = (templates_dir / "index.html").read_text("utf8") return render_template_string( - template, mode=mode, token=token, flags=flags, roles=UserRole.list_all_roles(), **kwargs + template, mode=mode, token=token, flags=flags, roles=UserRole.list_all_roles(session), **kwargs ) -def user_exists(user_email: str = None, 
api_key: str = None): - user = User.find_user(user_email=user_email, api_key=api_key) - return True if user else False - - # ~~~~ PUBLIC ROUTES ~~~~ @@ -67,44 +59,50 @@ def removal_request_redirect(): def _index(): token = _require_admin() flags = dict() - if request.method == "POST": - # register a new user - if not user_exists(user_email=request.values["email"], api_key=request.values["api_key"]): - User.create_user( - request.values["api_key"], - request.values["email"], - _parse_roles(request.values.getlist("roles")), - ) - flags["banner"] = "Successfully Added" - else: - flags["banner"] = "User with such email and/or api key already exists." - users = [user.as_dict for user in User.list_users()] - return _render("overview", token, flags, users=users, user=dict()) + with WriteSession() as session: + if request.method == "POST": + # register a new user + if not User.find_user( + user_email=request.values["email"], api_key=request.values["api_key"], + session=session): + User.create_user( + api_key=request.values["api_key"], + email=request.values["email"], + user_roles=set(request.values.getlist("roles")), + session=session + ) + flags["banner"] = "Successfully Added" + else: + flags["banner"] = "User with such email and/or api key already exists." + users = [user.as_dict for user in session.query(User).all()] + return _render("overview", token, flags, session=session, users=users, user=dict()) @bp.route("/", methods=["GET", "PUT", "POST", "DELETE"]) def _detail(user_id: int): token = _require_admin() - user = User.find_user(user_id=user_id) - if not user: - raise NotFound() - if request.method == "DELETE" or "delete" in request.values: - User.delete_user(user.id) - return redirect(f"./?auth={token}") - flags = dict() - if request.method in ["PUT", "POST"]: - user_check = User.find_user(api_key=request.values["api_key"], user_email=request.values["email"]) - if user_check and user_check.id != user.id: - flags["banner"] = "Could not update user; same api_key and/or email already exists." - else: - user = user.update_user( - user=user, - api_key=request.values["api_key"], - email=request.values["email"], - roles=_parse_roles(request.values.getlist("roles")), - ) - flags["banner"] = "Successfully Saved" - return _render("detail", token, flags, user=user.as_dict) + with WriteSession() as session: + user = User.find_user(user_id=user_id, session=session) + if not user: + raise NotFound() + if request.method == "DELETE" or "delete" in request.values: + User.delete_user(user.id, session=session) + return redirect(f"./?auth={token}") + flags = dict() + if request.method in ["PUT", "POST"]: + user_check = User.find_user(api_key=request.values["api_key"], user_email=request.values["email"], session=session) + if user_check and user_check.id != user.id: + flags["banner"] = "Could not update user; same api_key and/or email already exists." 
+ else: + user = User.update_user( + user=user, + api_key=request.values["api_key"], + email=request.values["email"], + roles=set(request.values.getlist("roles")), + session=session + ) + flags["banner"] = "Successfully Saved" + return _render("detail", token, flags, session=session, user=user.as_dict) @bp.route("/register", methods=["POST"]) @@ -116,12 +114,13 @@ def _register(): user_api_key = body["user_api_key"] user_email = body["user_email"] - if user_exists(user_email=user_email, api_key=user_api_key): - return make_response( - "User with email and/or API Key already exists, use different parameters or contact us for help", - 409, - ) - User.create_user(api_key=user_api_key, email=user_email) + with WriteSession() as session: + if User.find_user(user_email=user_email, api_key=user_api_key, session=session): + return make_response( + "User with email and/or API Key already exists, use different parameters or contact us for help", + 409, + ) + User.create_user(api_key=user_api_key, email=user_email, session=session) return make_response(f"Successfully registered API key '{user_api_key}'", 200) diff --git a/src/server/main.py b/src/server/main.py index c05b9d0d3..a91a91ee2 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -8,11 +8,9 @@ from ._config import URL_PREFIX, VERSION from ._common import app, set_compatibility_mode -from ._db import metadata, engine from ._exceptions import MissingOrWrongSourceException from .endpoints import endpoints from .endpoints.admin import bp as admin_bp, enable_admin -from ._security import register_user_role from ._limiter import limiter, apply_limit __all__ = ["app"] @@ -65,8 +63,6 @@ def send_lib_file(path: str): return send_from_directory(pathlib.Path(__file__).parent / "lib", path) -metadata.create_all(engine) - if __name__ == "__main__": app.run(host="0.0.0.0", port=5000) else: From fa0bd53a095096e775d5545f4cb11a229a34a0cc Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 26 Jun 2023 09:40:51 -0700 Subject: [PATCH 16/43] feat(afhsb): remove afhsb from .ts file --- src/client/delphi_epidata.d.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/client/delphi_epidata.d.ts b/src/client/delphi_epidata.d.ts index f88b18247..0b81db779 100644 --- a/src/client/delphi_epidata.d.ts +++ b/src/client/delphi_epidata.d.ts @@ -20,7 +20,6 @@ declare module 'delphi_epidata' { client_version: string; version(): Promise<{version: string, client_version: string}>; - afhsb(callback: EpiDataCallback, auth: string, locations: StringParam, epiweeks: EpiRangeParam, flu_types: StringParam): Promise; cdc(callback: EpiDataCallback, auth: string, epiweeks: EpiRangeParam, locations: StringParam): Promise; covid_hosp_facility(callback: EpiDataCallback, hospital_pks: StringParam, collection_weeks: EpiRangeParam, publication_dates: EpiRangeParam): Promise; covid_hosp_facility_lookup(callback: EpiDataCallback, state?: string, ccn?: string, city?: string, zip?: string, fips_code?: string): Promise; @@ -37,7 +36,6 @@ declare module 'delphi_epidata' { gft(callback: EpiDataCallback, locations: StringParam, epiweeks: EpiRangeParam): Promise; ght(callback: EpiDataCallback, auth: string, locations: StringParam, epiweeks: EpiRangeParam, query: string): Promise; kcdc_ili(callback: EpiDataCallback, regions: StringParam, epiweeks: EpiRangeParam, issues?: EpiRangeParam, lag?: number): Promise; - meta_afhsb(callback: EpiDataCallback, auth: string): Promise; meta_norostat(callback: EpiDataCallback, auth: string): Promise; meta(callback: EpiDataCallback): Promise; 
nidss_dengue(callback: EpiDataCallback, locations: StringParam, epiweeks: EpiRangeParam): Promise; @@ -61,7 +59,6 @@ declare module 'delphi_epidata' { client_version: string; version(): Promise<{ version: string, client_version: string }>; - afhsb(auth: string, locations: StringParam, epiweeks: EpiRangeParam, flu_types: StringParam): Promise; cdc(auth: string, epiweeks: EpiRangeParam, locations: StringParam): Promise; covid_hosp_facility(hospital_pks: StringParam, collection_weeks: EpiRangeParam, publication_dates: EpiRangeParam): Promise; covid_hosp_facility_lookup(state?: string, ccn?: string, city?: string, zip?: string, fips_code?: string): Promise; @@ -78,7 +75,6 @@ declare module 'delphi_epidata' { gft(locations: StringParam, epiweeks: EpiRangeParam): Promise; ght(auth: string, locations: StringParam, epiweeks: EpiRangeParam, query: string): Promise; kcdc_ili(regions: StringParam, epiweeks: EpiRangeParam, issues?: EpiRangeParam, lag?: number): Promise; - meta_afhsb(auth: string): Promise; meta_norostat(auth: string): Promise; meta(callback: EpiDataCallback): Promise; nidss_dengue(locations: StringParam, epiweeks: EpiRangeParam): Promise; From 60cdbb1be5195e906e0bbac3a79c2890d5011b64 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:47:44 -0700 Subject: [PATCH 17/43] ci(black): set line-length 100 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d255c2849..d8589df09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.black] -line-length = 200 +line-length = 100 target-version = ['py38'] include = 'server,tests/server' From 980b0b7e80c7923b79e14fee620645e680785703 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:49:43 -0700 Subject: [PATCH 18/43] style(black): format cdc acquisition --- src/acquisition/cdcp/cdc_dropbox_receiver.py | 212 ++++++------ src/acquisition/cdcp/cdc_extract.py | 213 ++++++------ src/acquisition/cdcp/cdc_upload.py | 327 ++++++++++--------- 3 files changed, 379 insertions(+), 373 deletions(-) diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index eb0d97f2a..65626101b 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -29,128 +29,128 @@ # location constants -DROPBOX_BASE_DIR = '/cdc_page_stats' -DELPHI_BASE_DIR = '/common/cdc_stage' +DROPBOX_BASE_DIR = "/cdc_page_stats" +DELPHI_BASE_DIR = "/common/cdc_stage" def get_timestamp_string(): - """ - Return the current local date and time as a string. + """ + Return the current local date and time as a string. - The format is "%Y%m%d_%H%M%S". - """ - return datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + The format is "%Y%m%d_%H%M%S". 
+ """ + return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") def trigger_further_processing(): - """Add CDCP processing scripts to the Automation run queue.""" + """Add CDCP processing scripts to the Automation run queue.""" - # connect - u, p = secrets.db.auto - cnx = mysql.connector.connect(user=u, password=p, database='automation') - cur = cnx.cursor() + # connect + u, p = secrets.db.auto + cnx = mysql.connector.connect(user=u, password=p, database="automation") + cur = cnx.cursor() - # add step "Process CDCP Data" to queue - cur.execute('CALL automation.RunStep(46)') + # add step "Process CDCP Data" to queue + cur.execute("CALL automation.RunStep(46)") - # disconnect - cur.close() - cnx.commit() - cnx.close() + # disconnect + cur.close() + cnx.commit() + cnx.close() def fetch_data(): - """ - Check for new files on dropbox, download them, zip them, cleanup dropbox, and - trigger further processing of new data. - """ - - # initialize dropbox api - dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) - - # look for new CDC data files - print('checking dropbox:%s' % DROPBOX_BASE_DIR) - save_list = [] - for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: - name = entry.name - if name.endswith('.csv') or name.endswith('.zip'): - print(' download "%s"' % name) - save_list.append(name) - else: - print(' skip "%s"' % name) - - # determine if there's anything to be done - if len(save_list) == 0: - print('did not find any new data files') - return - - # download new files, saving them inside of a new zip file - timestamp = get_timestamp_string() - zip_path = '%s/dropbox_%s.zip' % (DELPHI_BASE_DIR, timestamp) - print('downloading into delphi:%s' % zip_path) - with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zf: + """ + Check for new files on dropbox, download them, zip them, cleanup dropbox, and + trigger further processing of new data. + """ + + # initialize dropbox api + dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) + + # look for new CDC data files + print(f"checking dropbox: {DROPBOX_BASE_DIR}") + save_list = [] + for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: + name = entry.name + if name.endswith(".csv") or name.endswith(".zip"): + print(f" download: {name}") + save_list.append(name) + else: + print(f" skip: {name}") + + # determine if there's anything to be done + if len(save_list) == 0: + print("did not find any new data files") + return + + # download new files, saving them inside of a new zip file + timestamp = get_timestamp_string() + zip_path = f"{DELPHI_BASE_DIR}/dropbox_{timestamp}.zip" + print(f"downloading into delphi:{zip_path}") + with ZipFile(zip_path, "w", ZIP_DEFLATED) as zf: + for name in save_list: + # location of the file on dropbox + dropbox_path = f"{DROPBOX_BASE_DIR}/{name}" + print(f" {dropbox_path}") + + # start the download + meta, resp = dbx.files_download(dropbox_path) + + # check status and length + if resp.status_code != 200: + raise Exception(["resp.status_code", resp.status_code]) + dropbox_len = meta.size + print(" need %d bytes..." 
% dropbox_len) + content_len = int(resp.headers.get("Content-Length", -1)) + if dropbox_len != content_len: + info = ["dropbox_len", dropbox_len, "content_len", content_len] + raise Exception(info) + + # finish the download, holding the data in this variable + filedata = resp.content + + # check the length again + payload_len = len(filedata) + print(" downloaded") + if dropbox_len != payload_len: + info = ["dropbox_len", dropbox_len, "payload_len", payload_len] + raise Exception(info) + + # add the downloaded file to the zip file + zf.writestr(name, filedata) + print(" added") + + # At this point, all the data is stored and awaiting further processing on + # the delphi server. + print(f"saved all new data in {zip_path}") + + # on dropbox, archive downloaded files so they won't be downloaded again + archive_dir = f"archived_reports/processed_{timestamp}" + print("archiving files...") for name in save_list: - # location of the file on dropbox - dropbox_path = '%s/%s' % (DROPBOX_BASE_DIR, name) - print(' %s' % dropbox_path) - - # start the download - meta, resp = dbx.files_download(dropbox_path) - - # check status and length - if resp.status_code != 200: - raise Exception(['resp.status_code', resp.status_code]) - dropbox_len = meta.size - print(' need %d bytes...' % dropbox_len) - content_len = int(resp.headers.get('Content-Length', -1)) - if dropbox_len != content_len: - info = ['dropbox_len', dropbox_len, 'content_len', content_len] - raise Exception(info) - - # finish the download, holding the data in this variable - filedata = resp.content - - # check the length again - payload_len = len(filedata) - print(' downloaded') - if dropbox_len != payload_len: - info = ['dropbox_len', dropbox_len, 'payload_len', payload_len] - raise Exception(info) - - # add the downloaded file to the zip file - zf.writestr(name, filedata) - print(' added') - - # At this point, all the data is stored and awaiting further processing on - # the delphi server. 
- print('saved all new data in %s' % zip_path) - - # on dropbox, archive downloaded files so they won't be downloaded again - archive_dir = 'archived_reports/processed_%s' % timestamp - print('archiving files...') - for name in save_list: - # source and destination - dropbox_src = '%s/%s' % (DROPBOX_BASE_DIR, name) - dropbox_dst = '%s/%s/%s' % (DROPBOX_BASE_DIR, archive_dir, name) - print(' "%s" -> "%s"' % (dropbox_src, dropbox_dst)) - - # move the file - meta = dbx.files_move(dropbox_src, dropbox_dst) - - # sanity check - if archive_dir not in meta.path_lower: - raise Exception('failed to move "%s"' % name) - - # finally, trigger the usual processing flow - print('triggering processing flow') - trigger_further_processing() - print('done') + # source and destination + dropbox_src = f"{DROPBOX_BASE_DIR}/{name}" + dropbox_dst = f"{DROPBOX_BASE_DIR}/{archive_dir}/{name}" + print(f" {dropbox_src} -> {dropbox_dst}") + + # move the file + meta = dbx.files_move(dropbox_src, dropbox_dst) + + # sanity check + if archive_dir not in meta.path_lower: + raise Exception(f"failed to move {name}") + + # finally, trigger the usual processing flow + print("triggering processing flow") + trigger_further_processing() + print("done") def main(): - # fetch new data - fetch_data() + # fetch new data + fetch_data() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index 83ed08d5b..e4d7af573 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -75,7 +75,7 @@ def get_num_hits(cur, epiweek, state, page): - sql = ''' + sql = """ SELECT sum(c.`num`) `num` FROM @@ -86,36 +86,36 @@ def get_num_hits(cur, epiweek, state, page): m.`date` = c.`date` AND m.`state` = c.`state` WHERE m.`epiweek` = %s AND c.`state` = %s AND c.`page` LIKE %s - ''' - num = None - cur.execute(sql, (epiweek, state, page)) - for (num,) in cur: - pass - if num is None: - return 0 - return num + """ + num = None + cur.execute(sql, (epiweek, state, page)) + for (num,) in cur: + pass + if num is None: + return 0 + return num def get_total_hits(cur, epiweek, state): - sql = ''' + sql = """ SELECT sum(m.`total`) `total` FROM `cdc_meta` m WHERE m.`epiweek` = %s AND m.`state` = %s - ''' - total = None - cur.execute(sql, (epiweek, state)) - for (total,) in cur: - pass - if total is None: - raise Exception('missing data for %d-%s' % (epiweek, state)) - return total + """ + total = None + cur.execute(sql, (epiweek, state)) + for (total,) in cur: + pass + if total is None: + raise Exception("missing data for %d-%s" % (epiweek, state)) + return total def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total): - sql = ''' + sql = """ INSERT INTO `cdc_extract` (`epiweek`, `state`, `num1`, `num2`, `num3`, `num4`, `num5`, `num6`, `num7`, `num8`, `total`) VALUES @@ -130,94 +130,99 @@ def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, `num7` = %s, `num8` = %s, `total` = %s - ''' - values = [num1, num2, num3, num4, num5, num6, num7, num8, total] - args = tuple([epiweek, state] + values + values) - cur.execute(sql, args) + """ + values = [num1, num2, num3, num4, num5, num6, num7, num8, total] + args = tuple([epiweek, state] + values + values) + cur.execute(sql, args) def extract(first_week=None, last_week=None, test_mode=False): - # page title templates - pages = [ - '%What You Should Know for the % Influenza Season%', - '%What To Do If You Get Sick%', - 
'%Flu Symptoms & Severity%', - '%How Flu Spreads%', - '%What You Should Know About Flu Antiviral Drugs%', - '%Weekly US Map%', - '%Basics%', - '%Flu Activity & Surveillance%', - ] - - # location information - states = sorted(cdc_upload.STATES.values()) - - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # weeks to update - if first_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`') - for (first_week,) in cur: - pass - if last_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`') - for (last_week,) in cur: - pass - print('extracting %d--%d' % (first_week, last_week)) - - # update each epiweek - for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): - # update each state - for state in states: - try: - num1 = get_num_hits(cur, epiweek, state, pages[0]) - num2 = get_num_hits(cur, epiweek, state, pages[1]) - num3 = get_num_hits(cur, epiweek, state, pages[2]) - num4 = get_num_hits(cur, epiweek, state, pages[3]) - num5 = get_num_hits(cur, epiweek, state, pages[4]) - num6 = get_num_hits(cur, epiweek, state, pages[5]) - num7 = get_num_hits(cur, epiweek, state, pages[6]) - num8 = get_num_hits(cur, epiweek, state, pages[7]) - total = get_total_hits(cur, epiweek, state) - store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) - print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) - except Exception as ex: - print(' %d-%s: failed' % (epiweek, state), ex) - #raise ex - sys.stdout.flush() - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + # page title templates + pages = [ + "%What You Should Know for the % Influenza Season%", + "%What To Do If You Get Sick%", + "%Flu Symptoms & Severity%", + "%How Flu Spreads%", + "%What You Should Know About Flu Antiviral Drugs%", + "%Weekly US Map%", + "%Basics%", + "%Flu Activity & Surveillance%", + ] + + # location information + states = sorted(cdc_upload.STATES.values()) + + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # weeks to update + if first_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_extract`") + for (first_week,) in cur: + pass + if last_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_meta`") + for (last_week,) in cur: + pass + print("extracting %d--%d" % (first_week, last_week)) + + # update each epiweek + for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): + # update each state + for state in states: + try: + num1 = get_num_hits(cur, epiweek, state, pages[0]) + num2 = get_num_hits(cur, epiweek, state, pages[1]) + num3 = get_num_hits(cur, epiweek, state, pages[2]) + num4 = get_num_hits(cur, epiweek, state, pages[3]) + num5 = get_num_hits(cur, epiweek, state, pages[4]) + num6 = get_num_hits(cur, epiweek, state, pages[5]) + num7 = get_num_hits(cur, epiweek, state, pages[6]) + num8 = get_num_hits(cur, epiweek, state, pages[7]) + total = get_total_hits(cur, epiweek, state) + store_result( + cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total + ) + print( + " %d-%s: %d %d %d %d %d %d %d %d (%d)" + % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) + ) + except Exception as ex: + print(" %d-%s: failed" % (epiweek, state), ex) + # raise ex + sys.stdout.flush() + + # disconnect + cur.close() + if not test_mode: + 
cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--epiweek', '-w', default=None, type=int, help='epiweek override') - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() - - # sanity check - first, last, week = args.first, args.last, args.epiweek - for ew in [first, last, week]: - if ew is not None: - flu.check_epiweek(ew) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - if week is not None: - first = last = week - - # extract the page hits for all states on the specified weeks - extract(first, last, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--epiweek", "-w", default=None, type=int, help="epiweek override") + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + args = parser.parse_args() + + # sanity check + first, last, week = args.first, args.last, args.epiweek + for ew in [first, last, week]: + if ew is not None: + flu.check_epiweek(ew) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + if week is not None: + first = last = week + + # extract the page hits for all states on the specified weeks + extract(first, last, args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index c9c206dfa..fef0821b7 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -87,191 +87,192 @@ STATES = { - 'Alabama': 'AL', - 'Alaska': 'AK', - 'Arizona': 'AZ', - 'Arkansas': 'AR', - 'California': 'CA', - 'Colorado': 'CO', - 'Connecticut': 'CT', - 'Delaware': 'DE', - 'District of Columbia': 'DC', - 'Florida': 'FL', - 'Georgia': 'GA', - 'Hawaii': 'HI', - 'Idaho': 'ID', - 'Illinois': 'IL', - 'Indiana': 'IN', - 'Iowa': 'IA', - 'Kansas': 'KS', - 'Kentucky': 'KY', - 'Louisiana': 'LA', - 'Maine': 'ME', - 'Maryland': 'MD', - 'Massachusetts': 'MA', - 'Michigan': 'MI', - 'Minnesota': 'MN', - 'Mississippi': 'MS', - 'Missouri': 'MO', - 'Montana': 'MT', - 'Nebraska': 'NE', - 'Nevada': 'NV', - 'New Hampshire': 'NH', - 'New Jersey': 'NJ', - 'New Mexico': 'NM', - 'New York': 'NY', - 'North Carolina': 'NC', - 'North Dakota': 'ND', - 'Ohio': 'OH', - 'Oklahoma': 'OK', - 'Oregon': 'OR', - 'Pennsylvania': 'PA', - 'Rhode Island': 'RI', - 'South Carolina': 'SC', - 'South Dakota': 'SD', - 'Tennessee': 'TN', - 'Texas': 'TX', - 'Utah': 'UT', - 'Vermont': 'VT', - 'Virginia': 'VA', - 'Washington': 'WA', - 'West Virginia': 'WV', - 'Wisconsin': 'WI', - 'Wyoming': 'WY', - #'Puerto Rico': 'PR', - #'Virgin Islands': 'VI', - #'Guam': 'GU', + "Alabama": "AL", + "Alaska": "AK", + "Arizona": "AZ", + "Arkansas": "AR", + "California": "CA", + "Colorado": "CO", + "Connecticut": "CT", + "Delaware": "DE", + "District of Columbia": "DC", + "Florida": "FL", + "Georgia": "GA", + "Hawaii": "HI", + "Idaho": "ID", + "Illinois": "IL", + "Indiana": "IN", + "Iowa": "IA", + "Kansas": "KS", + 
"Kentucky": "KY", + "Louisiana": "LA", + "Maine": "ME", + "Maryland": "MD", + "Massachusetts": "MA", + "Michigan": "MI", + "Minnesota": "MN", + "Mississippi": "MS", + "Missouri": "MO", + "Montana": "MT", + "Nebraska": "NE", + "Nevada": "NV", + "New Hampshire": "NH", + "New Jersey": "NJ", + "New Mexico": "NM", + "New York": "NY", + "North Carolina": "NC", + "North Dakota": "ND", + "Ohio": "OH", + "Oklahoma": "OK", + "Oregon": "OR", + "Pennsylvania": "PA", + "Rhode Island": "RI", + "South Carolina": "SC", + "South Dakota": "SD", + "Tennessee": "TN", + "Texas": "TX", + "Utah": "UT", + "Vermont": "VT", + "Virginia": "VA", + "Washington": "WA", + "West Virginia": "WV", + "Wisconsin": "WI", + "Wyoming": "WY", + #'Puerto Rico': 'PR', + #'Virgin Islands': 'VI', + #'Guam': 'GU', } -sql_cdc = ''' +sql_cdc = """ INSERT INTO `cdc` (`date`, `page`, `state`, `num`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s -''' +""" -sql_cdc_meta = ''' +sql_cdc_meta = """ INSERT INTO `cdc_meta` (`date`, `epiweek`, `state`, `total`) VALUES (%s, yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = %s -''' +""" def upload(test_mode): - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # insert (or update) table `cdc` - def insert_cdc(date, page, state, num): - cur.execute(sql_cdc, (date, page, state, num, num)) - - # insert (or update) table `cdc_meta` - def insert_cdc_meta(date, state, total): - cur.execute(sql_cdc_meta, (date, date, state, total, total)) - - # loop over rows until the header row is found - def find_header(reader): - for row in reader: - if len(row) > 0 and row[0] == 'Date': - return True - return False - - # parse csv files for `cdc` and `cdc_meta` - def parse_csv(meta): - def handler(reader): - if not find_header(reader): - raise Exception('header not found') - count = 0 - cols = 3 if meta else 4 - for row in reader: - if len(row) != cols: - continue - if meta: - (a, c, d) = row - else: - (a, b, c, d) = row - c = c[:-16] - if c not in STATES: - continue - a = datetime.strptime(a, '%b %d, %Y').strftime('%Y-%m-%d') - c = STATES[c] - d = int(d) - if meta: - insert_cdc_meta(a, c, d) + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # insert (or update) table `cdc` + def insert_cdc(date, page, state, num): + cur.execute(sql_cdc, (date, page, state, num, num)) + + # insert (or update) table `cdc_meta` + def insert_cdc_meta(date, state, total): + cur.execute(sql_cdc_meta, (date, date, state, total, total)) + + # loop over rows until the header row is found + def find_header(reader): + for row in reader: + if len(row) > 0 and row[0] == "Date": + return True + return False + + # parse csv files for `cdc` and `cdc_meta` + def parse_csv(meta): + def handler(reader): + if not find_header(reader): + raise Exception("header not found") + count = 0 + cols = 3 if meta else 4 + for row in reader: + if len(row) != cols: + continue + if meta: + (a, c, d) = row + else: + (a, b, c, d) = row + c = c[:-16] + if c not in STATES: + continue + a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d") + c = STATES[c] + d = int(d) + if meta: + insert_cdc_meta(a, c, d) + else: + insert_cdc(a, b, c, d) + count += 1 + return count + + return handler + + # recursively open zip files + def parse_zip(zf, level=1): + for name in zf.namelist(): + prefix = " " * level + print(prefix, name) + if name[-4:] == ".zip": + with zf.open(name) as temp: + with 
ZipFile(io.BytesIO(temp.read())) as zf2: + parse_zip(zf2, level + 1) + elif name[-4:] == ".csv": + handler = None + if "Flu Pages by Region" in name: + handler = parse_csv(False) + elif "Regions for all CDC" in name: + handler = parse_csv(True) + else: + print(prefix, " (skipped)") + if handler is not None: + with zf.open(name) as temp: + count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) + print(prefix, " %d rows" % count) + else: + print(prefix, " (ignored)") + + # find, parse, and move zip files + zip_files = glob.glob("/common/cdc_stage/*.zip") + print("searching...") + for f in zip_files: + print(" ", f) + print("parsing...") + for f in zip_files: + with ZipFile(f) as zf: + parse_zip(zf) + print("moving...") + for f in zip_files: + src = f + dst = os.path.join("/home/automation/cdc_page_stats/", os.path.basename(src)) + print(" ", src, "->", dst) + if test_mode: + print(" (test mode enabled - not moved)") else: - insert_cdc(a, b, c, d) - count += 1 - return count - return handler - - # recursively open zip files - def parse_zip(zf, level=1): - for name in zf.namelist(): - prefix = ' ' * level - print(prefix, name) - if name[-4:] == '.zip': - with zf.open(name) as temp: - with ZipFile(io.BytesIO(temp.read())) as zf2: - parse_zip(zf2, level + 1) - elif name[-4:] == '.csv': - handler = None - if 'Flu Pages by Region' in name: - handler = parse_csv(False) - elif 'Regions for all CDC' in name: - handler = parse_csv(True) - else: - print(prefix, ' (skipped)') - if handler is not None: - with zf.open(name) as temp: - count = handler(csv.reader(io.StringIO(str(temp.read(), 'utf-8')))) - print(prefix, ' %d rows' % count) - else: - print(prefix, ' (ignored)') - - # find, parse, and move zip files - zip_files = glob.glob('/common/cdc_stage/*.zip') - print('searching...') - for f in zip_files: - print(' ', f) - print('parsing...') - for f in zip_files: - with ZipFile(f) as zf: - parse_zip(zf) - print('moving...') - for f in zip_files: - src = f - dst = os.path.join('/home/automation/cdc_page_stats/', os.path.basename(src)) - print(' ', src, '->', dst) - if test_mode: - print(' (test mode enabled - not moved)') - else: - shutil.move(src, dst) - if not os.path.isfile(dst): - raise Exception('unable to move file') - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + shutil.move(src, dst) + if not os.path.isfile(dst): + raise Exception("unable to move file") + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + args = parser.parse_args() - # make it happen - upload(args.test) + # make it happen + upload(args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() From 9e6ff16f599e8feec34a08dd1bddbc5eae347b55 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:50:26 -0700 Subject: [PATCH 19/43] style(black): format covidcast_nowcast acquisition --- src/acquisition/covidcast_nowcast/load_sensors.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/acquisition/covidcast_nowcast/load_sensors.py b/src/acquisition/covidcast_nowcast/load_sensors.py index 73ce7eee5..2e2269bb8 100644 --- 
a/src/acquisition/covidcast_nowcast/load_sensors.py +++ b/src/acquisition/covidcast_nowcast/load_sensors.py @@ -82,8 +82,7 @@ def load_and_prepare_file(filepath: str, attributes: PathDetails) -> pd.DataFram def _move_after_processing(filepath, success): archive_dir = SUCCESS_DIR if success else FAIL_DIR - new_dir = os.path.dirname(filepath).replace( - "receiving", archive_dir) + new_dir = os.path.dirname(filepath).replace("receiving", archive_dir) os.makedirs(new_dir, exist_ok=True) move(filepath, filepath.replace("receiving", archive_dir)) print(f"{filepath} moved to {archive_dir}") @@ -96,10 +95,14 @@ def method(table, conn, keys, data_iter): meta, # specify lag column explicitly; lag is a reserved word sqlalchemy doesn't know about sqlalchemy.Column("lag", sqlalchemy.Integer, quote=True), - autoload=True) - insert_stmt = sqlalchemy.dialects.mysql.insert(sql_table).values([dict(zip(keys, data)) for data in data_iter]) + autoload=True, + ) + insert_stmt = sqlalchemy.dialects.mysql.insert(sql_table).values( + [dict(zip(keys, data)) for data in data_iter] + ) upsert_stmt = insert_stmt.on_duplicate_key_update({x.name: x for x in insert_stmt.inserted}) conn.execute(upsert_stmt) + return method From d1141d904da4e62992b97c92d5caebd8fadffd42 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:51:28 -0700 Subject: [PATCH 20/43] style(black): format ecdc acquisition --- src/acquisition/ecdc/ecdc_db_update.py | 98 +++++++++++++------------- src/acquisition/ecdc/ecdc_ili.py | 68 +++++++++++------- 2 files changed, 91 insertions(+), 75 deletions(-) diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 63689c1d5..6e0083ecc 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -33,9 +33,8 @@ import argparse import datetime import glob -import subprocess -import random import os +import tempfile # third party import mysql.connector @@ -46,12 +45,14 @@ from delphi.utils.epiweek import delta_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `ecdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -62,58 +63,63 @@ def ensure_tables_exist(): `incidence_rate` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='ecdc_ili'): - # Count and return the number of rows in the `ecdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="ecdc_ili"): + # Count and return the number of rows in the `ecdc_ili` table. + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file(issue, date, dir, test_mode=False): # Read ECDC data from CSVs and insert into (or update) the database. 
# database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'ecdc_ili') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "ecdc_ili") + print("rows before: %d" % (rows1)) insert = cnx.cursor() # load the data, ignoring empty rows - files = glob.glob(os.path.join(dir,"*.csv")) + files = glob.glob(os.path.join(dir, "*.csv")) rows = [] for filename in files: - with open(filename,'r') as f: + with open(filename) as f: for l in f: - data = list(map(lambda s: s.strip().replace('"',''),l.split(','))) + data = list(map(lambda s: s.strip().replace('"', ""), l.split(","))) row = {} - row['epiweek'] = int(data[1][:4] + data[1][5:]) - row['region'] = data[4] - row['incidence_rate'] = data[3] + row["epiweek"] = int(data[1][:4] + data[1][5:]) + row["region"] = data[4] + row["incidence_rate"] = data[3] rows.append(row) - print(' loaded %d rows' % len(rows)) + print(" loaded %d rows" % len(rows)) entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(" found %d entries" % len(entries)) - sql = ''' + sql = """ INSERT INTO `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `incidence_rate`) @@ -122,13 +128,13 @@ def update_from_file(issue, date, dir, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `incidence_rate` = %s - ''' + """ for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['incidence_rate']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [row["incidence_rate"]] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args try: insert.execute(sql % tuple(insert_args + update_args)) @@ -138,39 +144,34 @@ def update_from_file(issue, date, dir, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", action="store_true", help="do dry run only, do not update the database" ) parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' + "--file", type=str, help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 201740); used iff --file is given' + "--issue", type=int, help="issue of the file (e.g. 
201740); used iff --file is given" ) args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) ensure_tables_exist() if args.file: @@ -204,7 +205,8 @@ def main(): if not db_error: break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `ecdc_ili` did not update successfully') + print("WARNING: Database `ecdc_ili` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/ecdc/ecdc_ili.py b/src/acquisition/ecdc/ecdc_ili.py index 1dd0505d1..dca9b51ae 100644 --- a/src/acquisition/ecdc/ecdc_ili.py +++ b/src/acquisition/ecdc/ecdc_ili.py @@ -11,60 +11,74 @@ from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import Select -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC -def download_ecdc_data(download_dir = "downloads"): - url = 'https://flunewseurope.org/PrimaryCareData' +def download_ecdc_data(download_dir="downloads"): + url = "https://flunewseurope.org/PrimaryCareData" resp = requests.get(url) - soup = BeautifulSoup(resp.content, 'lxml') - mydivs = soup.findAll('div') + soup = BeautifulSoup(resp.content, "lxml") + mydivs = soup.findAll("div") for div in mydivs: dic = div.attrs - if dic.get('class')== ['graph-container'] and dic.get('id')== 'dinfl06': + if dic.get("class") == ["graph-container"] and dic.get("id") == "dinfl06": break # get new url of the ILI chunck - url = div.contents[1].attrs['src'] + url = div.contents[1].attrs["src"] opts = webdriver.firefox.options.Options() opts.set_headless() fp = webdriver.FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(download_dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(download_dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") try: - driver = webdriver.Firefox(options=opts,firefox_profile=fp) + driver = webdriver.Firefox(options=opts, firefox_profile=fp) driver.get(url) for i in range(2, 54): # select country try: - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) - Select(driver.find_element_by_tag_name('select')).select_by_value(str(i)) + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "fluNewsReportViewer_ctl04_ctl03_ddValue")) + ) + Select(driver.find_element_by_tag_name("select")).select_by_value(str(i)) time.sleep(3) - soup = BeautifulSoup(driver.page_source, 'html.parser') - options = soup.select('#fluNewsReportViewer_ctl04_ctl05_ddValue')[0].find_all('option') + soup = BeautifulSoup(driver.page_source, "html.parser") + options = soup.select("#fluNewsReportViewer_ctl04_ctl05_ddValue")[0].find_all( 
+ "option" + ) ind = 1 for j in range(len(options)): - if 'ILI' in str(options[j]): - pattern = re.compile(r'\d+') + if "ILI" in str(options[j]): + pattern = re.compile(r"\d+") ind = re.findall(pattern, str(options[j]))[0] break if type(ind) == str: # select clinical tyle - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) - Select(driver.find_element_by_id('fluNewsReportViewer_ctl04_ctl05_ddValue')).select_by_value(ind) - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) - driver.find_element_by_id('btnSelectExportType').click() - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) - driver.find_element_by_id('btnExportToCsv').click() + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable( + (By.ID, "fluNewsReportViewer_ctl04_ctl05_ddValue") + ) + ) + Select( + driver.find_element_by_id("fluNewsReportViewer_ctl04_ctl05_ddValue") + ).select_by_value(ind) + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "btnSelectExportType")) + ) + driver.find_element_by_id("btnSelectExportType").click() + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "btnExportToCsv")) + ) + driver.find_element_by_id("btnExportToCsv").click() time.sleep(3) except: driver.get(url) except: - print('WARNING: ECDC Scraper may not have downloaded all of the available data.') - #cleanup - os.system('''pkill "firefox" ''') + print("WARNING: ECDC Scraper may not have downloaded all of the available data.") + # cleanup + os.system("""pkill "firefox" """) os.system('''pkill "(firefox-bin)"''') os.system('''pkill "geckodriver*"''') From 08af0f6b7bff85bbc2b193b63b5abf6a16ba03e4 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:52:50 -0700 Subject: [PATCH 21/43] style(black): format flusurv acquisition --- src/acquisition/flusurv/flusurv.py | 277 +++++++++++----------- src/acquisition/flusurv/flusurv_update.py | 193 ++++++++------- 2 files changed, 235 insertions(+), 235 deletions(-) diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 6b8d247ae..1e534b740 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -50,167 +50,170 @@ # all currently available FluSurv locations and their associated codes # the number pair represents NetworkID and CatchmentID location_codes = { - 'CA': (2, 1), - 'CO': (2, 2), - 'CT': (2, 3), - 'GA': (2, 4), - 'IA': (3, 5), - 'ID': (3, 6), - 'MD': (2, 7), - 'MI': (3, 8), - 'MN': (2, 9), - 'NM': (2, 11), - 'NY_albany': (2, 13), - 'NY_rochester': (2, 14), - 'OH': (3, 15), - 'OK': (3, 16), - 'OR': (2, 17), - 'RI': (3, 18), - 'SD': (3, 19), - 'TN': (2, 20), - 'UT': (3, 21), - 'network_all': (1, 22), - 'network_eip': (2, 22), - 'network_ihsp': (3, 22), + "CA": (2, 1), + "CO": (2, 2), + "CT": (2, 3), + "GA": (2, 4), + "IA": (3, 5), + "ID": (3, 6), + "MD": (2, 7), + "MI": (3, 8), + "MN": (2, 9), + "NM": (2, 11), + "NY_albany": (2, 13), + "NY_rochester": (2, 14), + "OH": (3, 15), + "OK": (3, 16), + "OR": (2, 17), + "RI": (3, 18), + "SD": (3, 19), + "TN": (2, 20), + "UT": (3, 21), + "network_all": (1, 22), + "network_eip": (2, 22), + "network_ihsp": (3, 22), } def fetch_json(path, payload, call_count=1, requests_impl=requests): - """Send a request to the server and return the parsed JSON response.""" - - # it's polite to self-identify this "bot" - delphi_url = 'https://delphi.cmu.edu/index.html' - user_agent = 'Mozilla/5.0 (compatible; 
delphibot/1.0; +%s)' % delphi_url - - # the FluSurv AMF server - flusurv_url = 'https://gis.cdc.gov/GRASP/Flu3/' + path - - # request headers - headers = { - 'Accept-Encoding': 'gzip', - 'User-Agent': user_agent, - } - if payload is not None: - headers['Content-Type'] = 'application/json;charset=UTF-8' - - # send the request and read the response - if payload is None: - method = requests_impl.get - data = None - else: - method = requests_impl.post - data = json.dumps(payload) - resp = method(flusurv_url, headers=headers, data=data) - - # check the HTTP status code - if resp.status_code == 500 and call_count <= 2: - # the server often fails with this status, so wait and retry - delay = 10 * call_count - print('got status %d, will retry in %d sec...' % (resp.status_code, delay)) - time.sleep(delay) - return fetch_json(path, payload, call_count=call_count + 1) - elif resp.status_code != 200: - raise Exception(['status code != 200', resp.status_code]) - - # check response mime type - if 'application/json' not in resp.headers.get('Content-Type', ''): - raise Exception('response is not json') - - # return the decoded json object - return resp.json() + """Send a request to the server and return the parsed JSON response.""" + + # it's polite to self-identify this "bot" + delphi_url = "https://delphi.cmu.edu/index.html" + user_agent = "Mozilla/5.0 (compatible; delphibot/1.0; +%s)" % delphi_url + + # the FluSurv AMF server + flusurv_url = "https://gis.cdc.gov/GRASP/Flu3/" + path + + # request headers + headers = { + "Accept-Encoding": "gzip", + "User-Agent": user_agent, + } + if payload is not None: + headers["Content-Type"] = "application/json;charset=UTF-8" + + # send the request and read the response + if payload is None: + method = requests_impl.get + data = None + else: + method = requests_impl.post + data = json.dumps(payload) + resp = method(flusurv_url, headers=headers, data=data) + + # check the HTTP status code + if resp.status_code == 500 and call_count <= 2: + # the server often fails with this status, so wait and retry + delay = 10 * call_count + print("got status %d, will retry in %d sec..." % (resp.status_code, delay)) + time.sleep(delay) + return fetch_json(path, payload, call_count=call_count + 1) + elif resp.status_code != 200: + raise Exception(["status code != 200", resp.status_code]) + + # check response mime type + if "application/json" not in resp.headers.get("Content-Type", ""): + raise Exception("response is not json") + + # return the decoded json object + return resp.json() def fetch_flusurv_object(location_code): - """Return decoded FluSurv JSON object for the given location.""" - return fetch_json('PostPhase03GetData', { - 'appversion': 'Public', - 'networkid': location_code[0], - 'cacthmentid': location_code[1], - }) + """Return decoded FluSurv JSON object for the given location.""" + return fetch_json( + "PostPhase03GetData", + { + "appversion": "Public", + "networkid": location_code[0], + "cacthmentid": location_code[1], + }, + ) def mmwrid_to_epiweek(mmwrid): - """Convert a CDC week index into an epiweek.""" + """Convert a CDC week index into an epiweek.""" - # Add the difference in IDs, which are sequential, to a reference epiweek, - # which is 2003w40 in this case. - epiweek_200340 = EpiDate(2003, 9, 28) - mmwrid_200340 = 2179 - return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() + # Add the difference in IDs, which are sequential, to a reference epiweek, + # which is 2003w40 in this case. 
+ epiweek_200340 = EpiDate(2003, 9, 28) + mmwrid_200340 = 2179 + return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() def extract_from_object(data_in): - """ - Given a FluSurv data object, return hospitaliation rates. - - The returned object is indexed first by epiweek, then by zero-indexed age - group. - """ - - # an object to hold the result - data_out = {} - - # iterate over all seasons and age groups - for obj in data_in['busdata']['dataseries']: - if obj['age'] in (10, 11, 12): - # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): - # capture as-of-yet undefined age groups 10, 11, and 12 - continue - age_index = obj['age'] - 1 - # iterage over weeks - for mmwrid, _, _, rate in obj['data']: - epiweek = mmwrid_to_epiweek(mmwrid) - if epiweek not in data_out: - # weekly rate of each age group - data_out[epiweek] = [None] * 9 - prev_rate = data_out[epiweek][age_index] - if prev_rate is None: - # this is the first time to see a rate for this epiweek/age - data_out[epiweek][age_index] = rate - elif prev_rate != rate: - # a different rate was already found for this epiweek/age - format_args = (epiweek, obj['age'], prev_rate, rate) - print('warning: %d %d %f != %f' % format_args) - - # sanity check the result - if len(data_out) == 0: - raise Exception('no data found') - - # print the result and return flu data - print('found data for %d weeks' % len(data_out)) - return data_out + """ + Given a FluSurv data object, return hospitaliation rates. + + The returned object is indexed first by epiweek, then by zero-indexed age + group. + """ + + # an object to hold the result + data_out = {} + + # iterate over all seasons and age groups + for obj in data_in["busdata"]["dataseries"]: + if obj["age"] in (10, 11, 12): + # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): + # capture as-of-yet undefined age groups 10, 11, and 12 + continue + age_index = obj["age"] - 1 + # iterage over weeks + for mmwrid, _, _, rate in obj["data"]: + epiweek = mmwrid_to_epiweek(mmwrid) + if epiweek not in data_out: + # weekly rate of each age group + data_out[epiweek] = [None] * 9 + prev_rate = data_out[epiweek][age_index] + if prev_rate is None: + # this is the first time to see a rate for this epiweek/age + data_out[epiweek][age_index] = rate + elif prev_rate != rate: + # a different rate was already found for this epiweek/age + format_args = (epiweek, obj["age"], prev_rate, rate) + print("warning: %d %d %f != %f" % format_args) + + # sanity check the result + if len(data_out) == 0: + raise Exception("no data found") + + # print the result and return flu data + print("found data for %d weeks" % len(data_out)) + return data_out def get_data(location_code): - """ - Fetch and parse flu data for the given location. + """ + Fetch and parse flu data for the given location. 
- This method performs the following operations: - - fetches FluSurv data from CDC - - extracts and returns hospitaliation rates - """ + This method performs the following operations: + - fetches FluSurv data from CDC + - extracts and returns hospitaliation rates + """ - # fetch - print('[fetching flusurv data...]') - data_in = fetch_flusurv_object(location_code) + # fetch + print("[fetching flusurv data...]") + data_in = fetch_flusurv_object(location_code) - # extract - print('[extracting values...]') - data_out = extract_from_object(data_in) + # extract + print("[extracting values...]") + data_out = extract_from_object(data_in) - # return - print('[scraped successfully]') - return data_out + # return + print("[scraped successfully]") + return data_out def get_current_issue(): - """Scrape the current issue from the FluSurv main page.""" + """Scrape the current issue from the FluSurv main page.""" - # fetch - data = fetch_json('GetPhase03InitApp?appVersion=Public', None) + # fetch + data = fetch_json("GetPhase03InitApp?appVersion=Public", None) - # extract - date = datetime.strptime(data['loaddatetime'], '%b %d, %Y') + # extract + date = datetime.strptime(data["loaddatetime"], "%b %d, %Y") - # convert and return - return EpiDate(date.year, date.month, date.day).get_ew() + # convert and return + return EpiDate(date.year, date.month, date.day).get_ew() diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 35fadba05..295091104 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -82,108 +82,105 @@ def get_rows(cur): - """Return the number of rows in the `flusurv` table.""" + """Return the number of rows in the `flusurv` table.""" - # count all rows - cur.execute('SELECT count(1) `num` FROM `flusurv`') - for (num,) in cur: - return num + # count all rows + cur.execute("SELECT count(1) `num` FROM `flusurv`") + for (num,) in cur: + return num def update(issue, location_name, test_mode=False): - """Fetch and store the currently avialble weekly FluSurv dataset.""" - - # fetch data - location_code = flusurv.location_codes[location_name] - print('fetching data for', location_name, location_code) - data = flusurv.get_data(location_code) - - # metadata - epiweeks = sorted(data.keys()) - location = location_name - release_date = str(EpiDate.today()) - - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect( - host=secrets.db.host, user=u, password=p, database='epidata') - cur = cnx.cursor() - rows1 = get_rows(cur) - print('rows before: %d' % rows1) - - # SQL for insert/update - sql = ''' - INSERT INTO `flusurv` ( - `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, - `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, - `rate_age_5`, `rate_age_6`, `rate_age_7` - ) - VALUES ( - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s - ) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `rate_age_0` = coalesce(%s, `rate_age_0`), - `rate_age_1` = coalesce(%s, `rate_age_1`), - `rate_age_2` = coalesce(%s, `rate_age_2`), - `rate_age_3` = coalesce(%s, `rate_age_3`), - `rate_age_4` = coalesce(%s, `rate_age_4`), - `rate_overall` = coalesce(%s, `rate_overall`), - `rate_age_5` = coalesce(%s, `rate_age_5`), - `rate_age_6` = coalesce(%s, `rate_age_6`), - `rate_age_7` = coalesce(%s, `rate_age_7`) - ''' - - # insert/update each row of data (one per epiweek) - for epiweek in epiweeks: - lag = delta_epiweeks(epiweek, issue) - if 
lag > 52: - # Ignore values older than one year, as (1) they are assumed not to - # change, and (2) it would adversely affect database performance if all - # values (including duplicates) were stored on each run. - continue - args_meta = [release_date, issue, epiweek, location, lag] - args_insert = data[epiweek] - args_update = [release_date] + data[epiweek] - cur.execute(sql, tuple(args_meta + args_insert + args_update)) - - # commit and disconnect - rows2 = get_rows(cur) - print('rows after: %d (+%d)' % (rows2, rows2 - rows1)) - cur.close() - if test_mode: - print('test mode: not committing database changes') - else: - cnx.commit() - cnx.close() + """Fetch and store the currently avialble weekly FluSurv dataset.""" + + # fetch data + location_code = flusurv.location_codes[location_name] + print("fetching data for", location_name, location_code) + data = flusurv.get_data(location_code) + + # metadata + epiweeks = sorted(data.keys()) + location = location_name + release_date = str(EpiDate.today()) + + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(host=secrets.db.host, user=u, password=p, database="epidata") + cur = cnx.cursor() + rows1 = get_rows(cur) + print("rows before: %d" % rows1) + + # SQL for insert/update + sql = """ + INSERT INTO `flusurv` ( + `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, + `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, + `rate_age_5`, `rate_age_6`, `rate_age_7` + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `rate_age_0` = coalesce(%s, `rate_age_0`), + `rate_age_1` = coalesce(%s, `rate_age_1`), + `rate_age_2` = coalesce(%s, `rate_age_2`), + `rate_age_3` = coalesce(%s, `rate_age_3`), + `rate_age_4` = coalesce(%s, `rate_age_4`), + `rate_overall` = coalesce(%s, `rate_overall`), + `rate_age_5` = coalesce(%s, `rate_age_5`), + `rate_age_6` = coalesce(%s, `rate_age_6`), + `rate_age_7` = coalesce(%s, `rate_age_7`) + """ + + # insert/update each row of data (one per epiweek) + for epiweek in epiweeks: + lag = delta_epiweeks(epiweek, issue) + if lag > 52: + # Ignore values older than one year, as (1) they are assumed not to + # change, and (2) it would adversely affect database performance if all + # values (including duplicates) were stored on each run. + continue + args_meta = [release_date, issue, epiweek, location, lag] + args_insert = data[epiweek] + args_update = [release_date] + data[epiweek] + cur.execute(sql, tuple(args_meta + args_insert + args_update)) + + # commit and disconnect + rows2 = get_rows(cur) + print("rows after: %d (+%d)" % (rows2, rows2 - rows1)) + cur.close() + if test_mode: + print("test mode: not committing database changes") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'location', - help='location for which data should be scraped (e.g. 
"CA" or "all")' - ) - parser.add_argument( - '--test', '-t', - default=False, action='store_true', help='do not commit database changes' - ) - args = parser.parse_args() - - # scrape current issue from the main page - issue = flusurv.get_current_issue() - print('current issue: %d' % issue) - - # fetch flusurv data - if args.location == 'all': - # all locations - for location in flusurv.location_codes.keys(): - update(issue, location, args.test) - else: - # single location - update(issue, args.location, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "location", help='location for which data should be scraped (e.g. "CA" or "all")' + ) + parser.add_argument( + "--test", "-t", default=False, action="store_true", help="do not commit database changes" + ) + args = parser.parse_args() + + # scrape current issue from the main page + issue = flusurv.get_current_issue() + print("current issue: %d" % issue) + + # fetch flusurv data + if args.location == "all": + # all locations + for location in flusurv.location_codes.keys(): + update(issue, location, args.test) + else: + # single location + update(issue, args.location, args.test) + + +if __name__ == "__main__": + main() From 0133ef2042c4df8867e91595eb1f64873edb4632 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:53:10 -0700 Subject: [PATCH 22/43] style(black): format fluview acquisition --- src/acquisition/fluview/fluview.py | 329 ++++---- src/acquisition/fluview/fluview_locations.py | 186 ++--- src/acquisition/fluview/fluview_notify.py | 80 +- src/acquisition/fluview/fluview_update.py | 772 +++++++++--------- .../fluview/impute_missing_values.py | 493 ++++++----- 5 files changed, 947 insertions(+), 913 deletions(-) diff --git a/src/acquisition/fluview/fluview.py b/src/acquisition/fluview/fluview.py index d723cbc59..a7e9fba87 100644 --- a/src/acquisition/fluview/fluview.py +++ b/src/acquisition/fluview/fluview.py @@ -34,183 +34,188 @@ class Key: - """ - Constants for navigating the metadata object contained in the web response - from CDC. - """ + """ + Constants for navigating the metadata object contained in the web response + from CDC. 
+ """ - class TierType: - nat = 'National' - hhs = 'HHS Regions' - cen = 'Census Divisions' - sta = 'State' + class TierType: + nat = "National" + hhs = "HHS Regions" + cen = "Census Divisions" + sta = "State" - class TierListEntry: - hhs = 'hhsregion' - cen = 'censusregions' - sta = 'states' + class TierListEntry: + hhs = "hhsregion" + cen = "censusregions" + sta = "states" - class TierIdEntry: - hhs = 'hhsregionid' - cen = 'censusregionid' - sta = 'stateid' + class TierIdEntry: + hhs = "hhsregionid" + cen = "censusregionid" + sta = "stateid" def check_status(resp, status, content_type): - """Raise an exception if the status code or content type is unexpected.""" - if resp.status_code != status: - raise Exception('got unexpected status code: ' + str(resp.status_code)) - actual_type = resp.headers.get('Content-Type', None) - if actual_type is None or content_type not in actual_type.lower(): - raise Exception('got unexpected content type: ' + str(actual_type)) + """Raise an exception if the status code or content type is unexpected.""" + if resp.status_code != status: + raise Exception("got unexpected status code: " + str(resp.status_code)) + actual_type = resp.headers.get("Content-Type", None) + if actual_type is None or content_type not in actual_type.lower(): + raise Exception("got unexpected content type: " + str(actual_type)) def fetch_metadata(sess): - """ - Return metadata indicating the current issue and also numeric constants - representing the various locations. - """ - url = 'https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public' - resp = sess.get(url) - check_status(resp, 200, 'application/json') - return resp.json() + """ + Return metadata indicating the current issue and also numeric constants + representing the various locations. 
+ """ + url = "https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public" + resp = sess.get(url) + check_status(resp, 200, "application/json") + return resp.json() def get_issue_and_locations(data): - """Extract the issue and per-tier location lists from the metadata object.""" - - def get_tier_ids(name): - for row in data['regiontypes']: - if row['description'] == name: - return row['regiontypeid'] - raise Exception() - - tier_ids = dict((name, get_tier_ids(name)) for name in ( - Key.TierType.nat, - Key.TierType.hhs, - Key.TierType.cen, - Key.TierType.sta, - )) - - location_ids = { - Key.TierType.nat: [0], - Key.TierType.hhs: [], - Key.TierType.cen: [], - Key.TierType.sta: [], - } - - # add location ids for HHS - for row in data[Key.TierListEntry.hhs]: - location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) - location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) - num = len(location_ids[Key.TierType.hhs]) - if num != 10: - raise Exception('expected 10 hhs regions, found %d' % num) - - # add location ids for census divisions - for row in data[Key.TierListEntry.cen]: - location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) - location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) - num = len(location_ids[Key.TierType.cen]) - if num != 9: - raise Exception('expected 9 census divisions, found %d' % num) - - # add location ids for states - for row in data[Key.TierListEntry.sta]: - location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) - location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) - num = len(location_ids[Key.TierType.sta]) - if num != 57: - raise Exception('expected 57 states/territories/cities, found %d' % num) - - # return a useful subset of the metadata - # (latest epiweek, latest season, tier ids, location ids) - return { - 'epiweek': data['mmwr'][-1]['yearweek'], - 'season_id': data['mmwr'][-1]['seasonid'], - 'tier_ids': tier_ids, - 'location_ids': location_ids, - } + """Extract the issue and per-tier location lists from the metadata object.""" + + def get_tier_ids(name): + for row in data["regiontypes"]: + if row["description"] == name: + return row["regiontypeid"] + raise Exception() + + tier_ids = { + name: get_tier_ids(name) + for name in ( + Key.TierType.nat, + Key.TierType.hhs, + Key.TierType.cen, + Key.TierType.sta, + ) + } + + location_ids = { + Key.TierType.nat: [0], + Key.TierType.hhs: [], + Key.TierType.cen: [], + Key.TierType.sta: [], + } + + # add location ids for HHS + for row in data[Key.TierListEntry.hhs]: + location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) + location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) + num = len(location_ids[Key.TierType.hhs]) + if num != 10: + raise Exception("expected 10 hhs regions, found %d" % num) + + # add location ids for census divisions + for row in data[Key.TierListEntry.cen]: + location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) + location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) + num = len(location_ids[Key.TierType.cen]) + if num != 9: + raise Exception("expected 9 census divisions, found %d" % num) + + # add location ids for states + for row in data[Key.TierListEntry.sta]: + location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) + location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) + num = len(location_ids[Key.TierType.sta]) + if num != 57: + raise Exception("expected 57 states/territories/cities, found %d" % 
num) + + # return a useful subset of the metadata + # (latest epiweek, latest season, tier ids, location ids) + return { + "epiweek": data["mmwr"][-1]["yearweek"], + "season_id": data["mmwr"][-1]["seasonid"], + "tier_ids": tier_ids, + "location_ids": location_ids, + } def download_data(tier_id, location_ids, season_ids, filename): - """Download zipped ILINet data for the given locations and seasons.""" - - def get_entry(num, name=None): - return {'ID': num, 'Name': (name if name else num)} - - # download the data (in memory) - url = 'https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload' - data = { - 'AppVersion': 'Public', - 'DatasourceDT': [get_entry(1, 'ILINet'), get_entry(0, 'WHO_NREVSS')], - 'RegionTypeId': tier_id, - 'SubRegionsDT': [get_entry(loc) for loc in sorted(location_ids)], - 'SeasonsDT': [get_entry(season) for season in sorted(season_ids)], - } - resp = requests.post(url, json=data) - check_status(resp, 200, 'application/octet-stream') - payload = resp.content - - # save the data to file and return the file length - with open(filename, 'wb') as f: - f.write(payload) - return len(payload) + """Download zipped ILINet data for the given locations and seasons.""" + + def get_entry(num, name=None): + return {"ID": num, "Name": (name if name else num)} + + # download the data (in memory) + url = "https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload" + data = { + "AppVersion": "Public", + "DatasourceDT": [get_entry(1, "ILINet"), get_entry(0, "WHO_NREVSS")], + "RegionTypeId": tier_id, + "SubRegionsDT": [get_entry(loc) for loc in sorted(location_ids)], + "SeasonsDT": [get_entry(season) for season in sorted(season_ids)], + } + resp = requests.post(url, json=data) + check_status(resp, 200, "application/octet-stream") + payload = resp.content + + # save the data to file and return the file length + with open(filename, "wb") as f: + f.write(payload) + return len(payload) def save_latest(path=None): - """ - Save the latest two seasons of data for all locations, separately for each - location tier (i.e. national, HHS, census, and states). 
- """ - - # set up the session - sess = requests.session() - sess.headers.update({ - # it's polite to self-identify this "bot" - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - }) - - # get metatdata - print('looking up ilinet metadata') - data = fetch_metadata(sess) - info = get_issue_and_locations(data) - issue = info['epiweek'] - print('current issue: %d' % issue) - - # establish timing - dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - current_season = info['season_id'] - seasons = [s for s in range(current_season - 1, current_season + 1)] - - # make the destination path if it doesn't already exist - if path is not None: - os.makedirs(path, exist_ok=True) - - # download the data file for each tier - files = [] - for delphi_name, cdc_name in ( - ('nat', Key.TierType.nat), - ('hhs', Key.TierType.hhs), - ('cen', Key.TierType.cen), - ('sta', Key.TierType.sta), - ): - name = 'ilinet_%s_%d_%s.zip' % (delphi_name, issue, dt) - if path is None: - filename = name - else: - filename = os.path.join(path, name) - tier_id = info['tier_ids'][cdc_name] - locations = info['location_ids'][cdc_name] - - # download and show timing information - print('downloading %s' % delphi_name) - t0 = time.time() - size = download_data(tier_id, locations, seasons, filename) - t1 = time.time() - - print(' saved %s (%d bytes in %.1f seconds)' % (filename, size, t1 - t0)) - files.append(filename) - - # return the current issue and the list of downloaded files - return issue, files + """ + Save the latest two seasons of data for all locations, separately for each + location tier (i.e. national, HHS, census, and states). + """ + + # set up the session + sess = requests.session() + sess.headers.update( + { + # it's polite to self-identify this "bot" + "User-Agent": "delphibot/1.0 (+https://delphi.cmu.edu/)", + } + ) + + # get metatdata + print("looking up ilinet metadata") + data = fetch_metadata(sess) + info = get_issue_and_locations(data) + issue = info["epiweek"] + print("current issue: %d" % issue) + + # establish timing + dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + current_season = info["season_id"] + seasons = [s for s in range(current_season - 1, current_season + 1)] + + # make the destination path if it doesn't already exist + if path is not None: + os.makedirs(path, exist_ok=True) + + # download the data file for each tier + files = [] + for delphi_name, cdc_name in ( + ("nat", Key.TierType.nat), + ("hhs", Key.TierType.hhs), + ("cen", Key.TierType.cen), + ("sta", Key.TierType.sta), + ): + name = "ilinet_%s_%d_%s.zip" % (delphi_name, issue, dt) + if path is None: + filename = name + else: + filename = os.path.join(path, name) + tier_id = info["tier_ids"][cdc_name] + locations = info["location_ids"][cdc_name] + + # download and show timing information + print("downloading %s" % delphi_name) + t0 = time.time() + size = download_data(tier_id, locations, seasons, filename) + t1 = time.time() + + print(" saved %s (%d bytes in %.1f seconds)" % (filename, size, t1 - t0)) + files.append(filename) + + # return the current issue and the list of downloaded files + return issue, files diff --git a/src/acquisition/fluview/fluview_locations.py b/src/acquisition/fluview/fluview_locations.py index 9c851bc6f..e5ebe0fc3 100644 --- a/src/acquisition/fluview/fluview_locations.py +++ b/src/acquisition/fluview/fluview_locations.py @@ -15,100 +15,100 @@ # https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public # The values are used in queries of Delphi's Epidata API. 
cdc_to_delphi = { - 'national': { - 'x': 'nat', - }, - 'hhs regions': { - 'region 1': 'hhs1', - 'region 2': 'hhs2', - 'region 3': 'hhs3', - 'region 4': 'hhs4', - 'region 5': 'hhs5', - 'region 6': 'hhs6', - 'region 7': 'hhs7', - 'region 8': 'hhs8', - 'region 9': 'hhs9', - 'region 10': 'hhs10', - }, - 'census regions': { - 'new england': 'cen1', - 'mid-atlantic': 'cen2', - 'east north central': 'cen3', - 'west north central': 'cen4', - 'south atlantic': 'cen5', - 'east south central': 'cen6', - 'west south central': 'cen7', - 'mountain': 'cen8', - 'pacific': 'cen9', - }, - 'states': { - # states/territories: two-letter ISO 3166 - 'alabama': 'al', - 'alaska': 'ak', - 'arizona': 'az', - 'arkansas': 'ar', - 'california': 'ca', - 'colorado': 'co', - 'connecticut': 'ct', - 'delaware': 'de', - 'florida': 'fl', - 'georgia': 'ga', - 'hawaii': 'hi', - 'idaho': 'id', - 'illinois': 'il', - 'indiana': 'in', - 'iowa': 'ia', - 'kansas': 'ks', - 'kentucky': 'ky', - 'louisiana': 'la', - 'maine': 'me', - 'maryland': 'md', - 'massachusetts': 'ma', - 'michigan': 'mi', - 'minnesota': 'mn', - 'mississippi': 'ms', - 'missouri': 'mo', - 'montana': 'mt', - 'nebraska': 'ne', - 'nevada': 'nv', - 'new hampshire': 'nh', - 'new jersey': 'nj', - 'new mexico': 'nm', - # Even though it's called "New York", this location doesn't include New - # York City ("jfk"). New York ("ny") is actually this *plus* jfk. - 'new york': 'ny_minus_jfk', - 'north carolina': 'nc', - 'north dakota': 'nd', - 'ohio': 'oh', - 'oklahoma': 'ok', - 'oregon': 'or', - 'pennsylvania': 'pa', - 'rhode island': 'ri', - 'south carolina': 'sc', - 'south dakota': 'sd', - 'tennessee': 'tn', - 'texas': 'tx', - 'utah': 'ut', - 'vermont': 'vt', - 'virginia': 'va', - 'washington': 'wa', - 'west virginia': 'wv', - 'wisconsin': 'wi', - 'wyoming': 'wy', - 'american samoa': 'as', - 'commonwealth of the northern mariana islands': 'mp', - 'district of columbia': 'dc', - 'guam': 'gu', - 'puerto rico': 'pr', - 'virgin islands': 'vi', - # cities: three-letter IATA - 'chicago': 'ord', - 'los angeles': 'lax', - 'new york city': 'jfk', - }, + "national": { + "x": "nat", + }, + "hhs regions": { + "region 1": "hhs1", + "region 2": "hhs2", + "region 3": "hhs3", + "region 4": "hhs4", + "region 5": "hhs5", + "region 6": "hhs6", + "region 7": "hhs7", + "region 8": "hhs8", + "region 9": "hhs9", + "region 10": "hhs10", + }, + "census regions": { + "new england": "cen1", + "mid-atlantic": "cen2", + "east north central": "cen3", + "west north central": "cen4", + "south atlantic": "cen5", + "east south central": "cen6", + "west south central": "cen7", + "mountain": "cen8", + "pacific": "cen9", + }, + "states": { + # states/territories: two-letter ISO 3166 + "alabama": "al", + "alaska": "ak", + "arizona": "az", + "arkansas": "ar", + "california": "ca", + "colorado": "co", + "connecticut": "ct", + "delaware": "de", + "florida": "fl", + "georgia": "ga", + "hawaii": "hi", + "idaho": "id", + "illinois": "il", + "indiana": "in", + "iowa": "ia", + "kansas": "ks", + "kentucky": "ky", + "louisiana": "la", + "maine": "me", + "maryland": "md", + "massachusetts": "ma", + "michigan": "mi", + "minnesota": "mn", + "mississippi": "ms", + "missouri": "mo", + "montana": "mt", + "nebraska": "ne", + "nevada": "nv", + "new hampshire": "nh", + "new jersey": "nj", + "new mexico": "nm", + # Even though it's called "New York", this location doesn't include New + # York City ("jfk"). New York ("ny") is actually this *plus* jfk. 
+ "new york": "ny_minus_jfk", + "north carolina": "nc", + "north dakota": "nd", + "ohio": "oh", + "oklahoma": "ok", + "oregon": "or", + "pennsylvania": "pa", + "rhode island": "ri", + "south carolina": "sc", + "south dakota": "sd", + "tennessee": "tn", + "texas": "tx", + "utah": "ut", + "vermont": "vt", + "virginia": "va", + "washington": "wa", + "west virginia": "wv", + "wisconsin": "wi", + "wyoming": "wy", + "american samoa": "as", + "commonwealth of the northern mariana islands": "mp", + "district of columbia": "dc", + "guam": "gu", + "puerto rico": "pr", + "virgin islands": "vi", + # cities: three-letter IATA + "chicago": "ord", + "los angeles": "lax", + "new york city": "jfk", + }, } def get_location_name(region_type, region_name): - """Convert a CDC location type and name pair into a Delphi location name.""" - return cdc_to_delphi[region_type.lower()][region_name.lower()] + """Convert a CDC location type and name pair into a Delphi location name.""" + return cdc_to_delphi[region_type.lower()][region_name.lower()] diff --git a/src/acquisition/fluview/fluview_notify.py b/src/acquisition/fluview/fluview_notify.py index 13f0f3559..a280889a5 100644 --- a/src/acquisition/fluview/fluview_notify.py +++ b/src/acquisition/fluview/fluview_notify.py @@ -31,41 +31,53 @@ import delphi.operations.secrets as secrets -if __name__ == '__main__': - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument('-t', '--test', action='store_const', const=True, default=False, help="do dry run only, don't update the database") - args = parser.parse_args() +if __name__ == "__main__": + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + "--test", + action="store_const", + const=True, + default=False, + help="do dry run only, don't update the database", + ) + args = parser.parse_args() - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - # get the last known issue from the automation table `variables` - cur.execute('SELECT `value` FROM automation.`variables` WHERE `name` = %s', ('most_recent_issue',)) - for (issue1,) in cur: - issue1 = int(issue1) - print('last known issue:', issue1) - # get the most recent issue from the epidata table `fluview` - cur.execute('SELECT max(`issue`) FROM `fluview`') - for (issue2,) in cur: - issue2 = int(issue2) - print('most recent issue:', issue2) + # get the last known issue from the automation table `variables` + cur.execute( + "SELECT `value` FROM automation.`variables` WHERE `name` = %s", ("most_recent_issue",) + ) + for (issue1,) in cur: + issue1 = int(issue1) + print("last known issue:", issue1) + # get the most recent issue from the epidata table `fluview` + cur.execute("SELECT max(`issue`) FROM `fluview`") + for (issue2,) in cur: + issue2 = int(issue2) + print("most recent issue:", issue2) - if issue2 > issue1: - print('new data is available!') - if args.test: - print('test mode - not making any changes') - else: - # update the variable - cur.execute('UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s', (issue2, 'most_recent_issue')) - # queue the 'New FluView Available' flow - cur.execute('CALL automation.RunStep(36)') - elif issue2 < issue2: - raise Exception('most recent issue is older than the last known issue') + if issue2 > issue1: + print("new data is available!") + if args.test: + print("test 
mode - not making any changes") + else: + # update the variable + cur.execute( + "UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s", + (issue2, "most_recent_issue"), + ) + # queue the 'New FluView Available' flow + cur.execute("CALL automation.RunStep(36)") + elif issue2 < issue2: + raise Exception("most recent issue is older than the last known issue") - # cleanup - cnx.commit() - cur.close() - cnx.close() + # cleanup + cnx.commit() + cur.close() + cnx.close() diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 65bec7a40..e463fcbaf 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -130,398 +130,422 @@ from . import fluview_locations # sheet names -ILINET_SHEET = 'ILINet.csv' -PHL_SHEET = 'WHO_NREVSS_Public_Health_Labs.csv' -CL_SHEET = 'WHO_NREVSS_Clinical_Labs.csv' +ILINET_SHEET = "ILINet.csv" +PHL_SHEET = "WHO_NREVSS_Public_Health_Labs.csv" +CL_SHEET = "WHO_NREVSS_Clinical_Labs.csv" # table names -CL_TABLE = 'fluview_clinical' -PHL_TABLE = 'fluview_public' +CL_TABLE = "fluview_clinical" +PHL_TABLE = "fluview_public" + def optional_int(i): - return int(i) if i not in ('', 'X') else None + return int(i) if i not in ("", "X") else None + def optional_float(i, j): - return float(i) if i not in ('', 'X') else float(j) + return float(i) if i not in ("", "X") else float(j) + def nullable_float(i): - return float(i) if i not in ('', 'X') else None + return float(i) if i not in ("", "X") else None + def get_ilinet_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - '% WEIGHTED ILI', - '%UNWEIGHTED ILI', - 'AGE 0-4', - 'AGE 25-49', - 'AGE 25-64', - 'AGE 5-24', - 'AGE 50-64', - 'AGE 65', - 'ILITOTAL', - 'NUM. OF PROVIDERS', - 'TOTAL PATIENTS' - ]: - raise Exception('header row has changed') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[5] == 'X': - # ILI isn't reported, ignore this row - return None - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'wili': optional_float(*row[4:6]), - 'ili': float(row[5]), - 'age0': optional_int(row[6]), - 'age1': optional_int(row[9]), - 'age2': optional_int(row[8]), - 'age3': optional_int(row[7]), - 'age4': optional_int(row[10]), - 'age5': optional_int(row[11]), - 'n_ili': optional_int(row[12]), - 'n_providers': optional_int(row[13]), - 'n_patients': optional_int(row[14]), - } + if row[0] == "REGION TYPE" and row != [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "% WEIGHTED ILI", + "%UNWEIGHTED ILI", + "AGE 0-4", + "AGE 25-49", + "AGE 25-64", + "AGE 5-24", + "AGE 50-64", + "AGE 65", + "ILITOTAL", + "NUM. 
OF PROVIDERS", + "TOTAL PATIENTS", + ]: + raise Exception("header row has changed") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[5] == "X": + # ILI isn't reported, ignore this row + return None + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "wili": optional_float(*row[4:6]), + "ili": float(row[5]), + "age0": optional_int(row[6]), + "age1": optional_int(row[9]), + "age2": optional_int(row[8]), + "age3": optional_int(row[7]), + "age4": optional_int(row[10]), + "age5": optional_int(row[11]), + "n_ili": optional_int(row[12]), + "n_providers": optional_int(row[13]), + "n_patients": optional_int(row[14]), + } + def get_clinical_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'TOTAL A', - 'TOTAL B', - 'PERCENT POSITIVE', - 'PERCENT A', - 'PERCENT B' - ]: - raise Exception('header row has changed for clinical lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[4] == 'X': - # data is not reported, ignore this row - return None - # ignore percentage calculations for now - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'total_specimens': int(row[4]), - 'total_a': optional_int(row[5]), - 'total_b': optional_int(row[6]), - 'percent_positive': nullable_float(row[7]), - 'percent_a': nullable_float(row[8]), - 'percent_b': nullable_float(row[9]) - } + if row[0] == "REGION TYPE" and row != [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "TOTAL SPECIMENS", + "TOTAL A", + "TOTAL B", + "PERCENT POSITIVE", + "PERCENT A", + "PERCENT B", + ]: + raise Exception("header row has changed for clinical lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[4] == "X": + # data is not reported, ignore this row + return None + # ignore percentage calculations for now + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "total_specimens": int(row[4]), + "total_a": optional_int(row[5]), + "total_b": optional_int(row[6]), + "percent_positive": nullable_float(row[7]), + "percent_a": nullable_float(row[8]), + "percent_b": nullable_float(row[9]), + } + def get_public_data(row): - hrow1 = [ - 'REGION TYPE', - 'REGION', - 'SEASON_DESCRIPTION', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - hrow2 = [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: - raise Exception('header row has changed for public health lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # header row - return None - if row[3] == 'X': - # data is not reported, ignore this row - return None - # handle case where data is reported by season, not by epiweek - is_weekly = len(row) == len(hrow2) - # set epiweek - if is_weekly: - epiweek = join_epiweek(int(row[2]), int(row[3])) - else: - epiweek = int(row[2][7:11]) * 100 + 40 - # row offset - offset = 1 if is_weekly else 0 - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': epiweek, - 'total_specimens': int(row[3 + offset]), - 'total_a_h1n1': 
optional_int(row[4+ offset]), - 'total_a_h3': optional_int(row[5 + offset]), - 'total_a_h3n2v': optional_int(row[10 + offset]), - 'total_a_no_sub': optional_int(row[6 + offset]), - 'total_b': optional_int(row[7 + offset]), - 'total_b_vic': optional_int(row[8 + offset]), - 'total_b_yam': optional_int(row[9 + offset]) - } - -def load_zipped_csv(filename, sheetname='ILINet.csv'): - """Read rows from a zipped CSV, which is expected to be named as specified - by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" - with zipfile.ZipFile(filename) as f: - with f.open(sheetname) as ff: - return [row for row in csv.reader(io.StringIO(str(ff.read(), 'utf-8')))] - -def get_rows(cnx, table='fluview'): - """Count and return the number of rows in the `fluview` table. - Looking at the fluview table by default, but may pass parameter - to look at public health or clinical lab data instead.""" - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + hrow1 = [ + "REGION TYPE", + "REGION", + "SEASON_DESCRIPTION", + "TOTAL SPECIMENS", + "A (2009 H1N1)", + "A (H3)", + "A (Subtyping not Performed)", + "B", + "BVic", + "BYam", + "H3N2v", + ] + hrow2 = [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "TOTAL SPECIMENS", + "A (2009 H1N1)", + "A (H3)", + "A (Subtyping not Performed)", + "B", + "BVic", + "BYam", + "H3N2v", + ] + if row[0] == "REGION TYPE" and row != hrow1 and row != hrow2: + raise Exception("header row has changed for public health lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # header row + return None + if row[3] == "X": + # data is not reported, ignore this row + return None + # handle case where data is reported by season, not by epiweek + is_weekly = len(row) == len(hrow2) + # set epiweek + if is_weekly: + epiweek = join_epiweek(int(row[2]), int(row[3])) + else: + epiweek = int(row[2][7:11]) * 100 + 40 + # row offset + offset = 1 if is_weekly else 0 + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": epiweek, + "total_specimens": int(row[3 + offset]), + "total_a_h1n1": optional_int(row[4 + offset]), + "total_a_h3": optional_int(row[5 + offset]), + "total_a_h3n2v": optional_int(row[10 + offset]), + "total_a_no_sub": optional_int(row[6 + offset]), + "total_b": optional_int(row[7 + offset]), + "total_b_vic": optional_int(row[8 + offset]), + "total_b_yam": optional_int(row[9 + offset]), + } + + +def load_zipped_csv(filename, sheetname="ILINet.csv"): + """Read rows from a zipped CSV, which is expected to be named as specified + by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" + with zipfile.ZipFile(filename) as f: + with f.open(sheetname) as ff: + return [row for row in csv.reader(io.StringIO(str(ff.read(), "utf-8")))] + + +def get_rows(cnx, table="fluview"): + """Count and return the number of rows in the `fluview` table. + Looking at the fluview table by default, but may pass parameter + to look at public health or clinical lab data instead.""" + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file_clinical(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, CL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, CL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_clinical_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, - `percent_b`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a` = %s, - `total_b` = %s, - `percent_positive` = %s, - `percent_a` = %s, - `percent_b` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a'], row['total_b'], - row['percent_positive'], row['percent_a'], row['percent_b'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. + """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, CL_TABLE) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename, CL_SHEET) + print(" loaded %d rows" % len(rows)) + data = [get_clinical_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, + `percent_b`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a` = %s, + `total_b` = %s, + `percent_positive` = %s, + `percent_a` = %s, + `percent_b` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["total_specimens"], + row["total_a"], + row["total_b"], + row["percent_positive"], + row["percent_a"], + row["percent_b"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file_public(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, PHL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, PHL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_public_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, - `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a_h1n1` = %s, - `total_a_h3` = %s, - `total_a_h3n2v` = %s, - `total_a_no_sub` = %s, - `total_b` = %s, - `total_b_vic` = %s, - `total_b_yam` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], - row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], - row['total_b_vic'], row['total_b_yam'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
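    A rough sketch of how the pieces above fit together (illustrative only; the
    zip path here is hypothetical):

        rows = load_zipped_csv("flu_data/FluView.zip", PHL_SHEET)
        # header rows and unreported rows parse to None and are dropped
        entries = [e for e in (get_public_data(r) for r in rows) if e]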
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, PHL_TABLE) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename, PHL_SHEET) + print(" loaded %d rows" % len(rows)) + data = [get_public_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, + `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a_h1n1` = %s, + `total_a_h3` = %s, + `total_a_h3n2v` = %s, + `total_a_no_sub` = %s, + `total_b` = %s, + `total_b_vic` = %s, + `total_b_yam` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["total_specimens"], + row["total_a_h1n1"], + row["total_a_h3"], + row["total_a_h3n2v"], + row["total_a_no_sub"], + row["total_b"], + row["total_b_vic"], + row["total_b_yam"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file(issue, date, filename, test_mode=False): - """ - Read ILINet data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename) - print(' loaded %d rows' % len(rows)) - data = [get_ilinet_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, - `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, - `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `num_ili` = %s, - `num_patients` = %s, - `num_providers` = %s, - `wili` = %s, - `ili` = %s, - `num_age_0` = coalesce(%s, `num_age_0`), - `num_age_1` = coalesce(%s, `num_age_1`), - `num_age_2` = coalesce(%s, `num_age_2`), - `num_age_3` = coalesce(%s, `num_age_3`), - `num_age_4` = coalesce(%s, `num_age_4`), - `num_age_5` = coalesce(%s, `num_age_5`) - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['n_ili'], row['n_patients'], row['n_providers'], row['wili'], - row['ili'], row['age0'], row['age1'], row['age2'], row['age3'], - row['age4'], row['age5'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read ILINet data from a zipped CSV and insert into (or update) the database. 
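    These loaders all bind a single flat parameter tuple: the first group of %s
    placeholders fills the VALUES list, and the data columns are repeated once
    more for the ON DUPLICATE KEY UPDATE clause. A stripped-down sketch of the
    pattern, with a hypothetical table and column set and an already-open
    cursor:

        sql = (
            "INSERT INTO `t` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `val`) "
            "VALUES (%s, %s, %s, %s, %s, %s) "
            "ON DUPLICATE KEY UPDATE "
            "`release_date` = least(`release_date`, %s), `val` = %s"
        )
        date, issue, epiweek, region, lag, val = "2023-06-23", 202325, 202323, "nat", 2, 1.5
        args = [val]                              # data columns, shared by both clauses
        ins_args = [date, issue, epiweek, region, lag] + args   # 6 VALUES placeholders
        upd_args = [date] + args                                # 2 UPDATE placeholders
        cursor.execute(sql, ins_args + upd_args)  # 8 placeholders, 8 parameters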
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename) + print(" loaded %d rows" % len(rows)) + data = [get_ilinet_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, + `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, + `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `num_ili` = %s, + `num_patients` = %s, + `num_providers` = %s, + `wili` = %s, + `ili` = %s, + `num_age_0` = coalesce(%s, `num_age_0`), + `num_age_1` = coalesce(%s, `num_age_1`), + `num_age_2` = coalesce(%s, `num_age_2`), + `num_age_3` = coalesce(%s, `num_age_3`), + `num_age_4` = coalesce(%s, `num_age_4`), + `num_age_5` = coalesce(%s, `num_age_5`) + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["n_ili"], + row["n_patients"], + row["n_providers"], + row["wili"], + row["ili"], + row["age0"], + row["age1"], + row["age2"], + row["age3"], + row["age4"], + row["age5"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 
201740); used iff --file is given' - ) - args = parser.parse_args() - - if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') - - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) - - if args.file: - update_from_file(args.issue, date, args.file, test_mode=args.test) - update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(args.issue, date, args.file, test_mode=args.test) - else: - issue, files = fluview.save_latest(path='flu_data') - for filename in files: - update_from_file(issue, date, filename, test_mode=args.test) - update_from_file_clinical(issue, date, filename, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(issue, date, filename, test_mode=args.test) - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "--test", action="store_true", help="do dry run only, do not update the database" + ) + parser.add_argument( + "--file", type=str, help="load an existing zip file (otherwise fetch current data)" + ) + parser.add_argument( + "--issue", type=int, help="issue of the file (e.g. 201740); used iff --file is given" + ) + args = parser.parse_args() + + if (args.file is None) != (args.issue is None): + raise Exception("--file and --issue must both be present or absent") + + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) + + if args.file: + update_from_file(args.issue, date, args.file, test_mode=args.test) + update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(args.issue, date, args.file, test_mode=args.test) + else: + issue, files = fluview.save_latest(path="flu_data") + for filename in files: + update_from_file(issue, date, filename, test_mode=args.test) + update_from_file_clinical(issue, date, filename, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(issue, date, filename, test_mode=args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 7f9a23231..230dd2f7d 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -59,290 +59,283 @@ class Database: - """Database wrapper and abstraction layer.""" - - class Sql: - """Container for SQL constants.""" - - # Count the total number of imputed rows. - count_rows = ''' - SELECT - count(1) `num` - FROM - `fluview_imputed` - ''' - - # Find (issue, epiweek) pairs that exist in table `fluview` but not in - # table `fluview_imputed`. Note that only issues >= 201740 are selected - # because that's when CDC first started posting state-level ILINet data. - # This assumes that `fluview` is always missing at least one location. - find_missing_rows = ''' - SELECT - fv.`issue`, fv.`epiweek` - FROM ( + """Database wrapper and abstraction layer.""" + + class Sql: + """Container for SQL constants.""" + + # Count the total number of imputed rows. 
+ count_rows = """ SELECT - `issue`, `epiweek` + count(1) `num` FROM - `fluview` + `fluview_imputed` + """ + + # Find (issue, epiweek) pairs that exist in table `fluview` but not in + # table `fluview_imputed`. Note that only issues >= 201740 are selected + # because that's when CDC first started posting state-level ILINet data. + # This assumes that `fluview` is always missing at least one location. + find_missing_rows = """ + SELECT + fv.`issue`, fv.`epiweek` + FROM ( + SELECT + `issue`, `epiweek` + FROM + `fluview` + WHERE + `issue` >= 201740 + GROUP BY + `issue`, `epiweek` + ) fv + LEFT JOIN ( + SELECT + `issue`, `epiweek` + FROM + `fluview_imputed` + GROUP BY + `issue`, `epiweek` + ) fvi + ON + fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` WHERE - `issue` >= 201740 - GROUP BY - `issue`, `epiweek` - ) fv - LEFT JOIN ( + fvi.`issue` IS NULL + """ + + # Read all location rows from the `fluview` table for a given issue and + # epiweek. + get_known_values = """ SELECT - `issue`, `epiweek` + `region`, `num_ili`, `num_patients`, `num_providers` FROM - `fluview_imputed` - GROUP BY - `issue`, `epiweek` - ) fvi - ON - fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` - WHERE - fvi.`issue` IS NULL - ''' - - # Read all location rows from the `fluview` table for a given issue and - # epiweek. - get_known_values = ''' - SELECT - `region`, `num_ili`, `num_patients`, `num_providers` - FROM - `fluview` - WHERE - `issue` = %s AND `epiweek` = %s - ''' - - # Insert location rows into the `fluview_imputed` table for a given issue - # and epiweek. - add_imputed_values = ''' - INSERT INTO - `fluview_imputed` ( - `issue`, - `epiweek`, - `region`, - `lag`, - `num_ili`, - `num_patients`, - `num_providers`, - `ili` - ) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s) - ''' - - def connect(self): - """Connect to the database.""" - u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database='epidata') - self.cur = self.cnx.cursor() - - def close(self, commit): - """ - Close the connection to the database, committing or rolling back changes as - indicated. - """ - self.cur.close() - if commit: - self.cnx.commit() - else: - print('test mode, not committing') - self.cnx.close() - - def count_rows(self): - """Count and return the number of rows in the `fluview_imputed` table.""" - self.cur.execute(Database.Sql.count_rows) - for (num,) in self.cur: - return num - - def find_missing_rows(self): - """ - Find rows that still have missing values. Each missing row is uniquely - identified by an (issue, epiweek, location) tuple. This function finds the - first two. - """ + `fluview` + WHERE + `issue` = %s AND `epiweek` = %s + """ + + # Insert location rows into the `fluview_imputed` table for a given issue + # and epiweek. + add_imputed_values = """ + INSERT INTO + `fluview_imputed` ( + `issue`, + `epiweek`, + `region`, + `lag`, + `num_ili`, + `num_patients`, + `num_providers`, + `ili` + ) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s) + """ + + def connect(self): + """Connect to the database.""" + u, p = secrets.db.epi + self.cnx = mysql.connector.connect(user=u, password=p, database="epidata") + self.cur = self.cnx.cursor() + + def close(self, commit): + """ + Close the connection to the database, committing or rolling back changes as + indicated. 
+ """ + self.cur.close() + if commit: + self.cnx.commit() + else: + print("test mode, not committing") + self.cnx.close() + + def count_rows(self): + """Count and return the number of rows in the `fluview_imputed` table.""" + self.cur.execute(Database.Sql.count_rows) + for (num,) in self.cur: + return num + + def find_missing_rows(self): + """ + Find rows that still have missing values. Each missing row is uniquely + identified by an (issue, epiweek, location) tuple. This function finds the + first two. + """ + + self.cur.execute(Database.Sql.find_missing_rows) + return [(issue, epiweek) for (issue, epiweek) in self.cur] + + def get_known_values(self, issue, epiweek): + """ + Fetch ILINet data for all locations available for the given issue and + epiweek. The returned value is a dict mapping from locations to ILI data. + """ + + self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) + return {loc: (n_ili, n_pat, n_prov) for (loc, n_ili, n_pat, n_prov) in self.cur} + + def add_imputed_values(self, issue, epiweek, imputed): + """ + Store imputed ILINet data for the given locations on the given issue and + epiweek. The imputed value is a dict mapping from locations to ILI data. + """ + + for loc in imputed.keys(): + lag, n_ili, n_pat, n_prov, ili = imputed[loc] + args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) + self.cur.execute(Database.Sql.add_imputed_values, args) - self.cur.execute(Database.Sql.find_missing_rows) - return [(issue, epiweek) for (issue, epiweek) in self.cur] - def get_known_values(self, issue, epiweek): - """ - Fetch ILINet data for all locations available for the given issue and - epiweek. The returned value is a dict mapping from locations to ILI data. - """ +class StatespaceException(Exception): + """Used to indicate that imputation is not possible with the given inputs.""" - self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) - return dict([ - (loc, (n_ili, n_pat, n_prov)) - for - (loc, n_ili, n_pat, n_prov) - in self.cur - ]) - def add_imputed_values(self, issue, epiweek, imputed): +def get_location_graph(): """ - Store imputed ILINet data for the given locations on the given issue and - epiweek. The imputed value is a dict mapping from locations to ILI data. + Return a matrix where rows represent regions, columns represent atoms, and + each entry is a 1 if the region contains the atom, otherwise 0. The + corresponding lists of regions and atoms are also returned. """ - for loc in imputed.keys(): - lag, n_ili, n_pat, n_prov, ili = imputed[loc] - args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) - self.cur.execute(Database.Sql.add_imputed_values, args) - - -class StatespaceException(Exception): - """Used to indicate that imputation is not possible with the given inputs.""" - - -def get_location_graph(): - """ - Return a matrix where rows represent regions, columns represent atoms, and - each entry is a 1 if the region contains the atom, otherwise 0. The - corresponding lists of regions and atoms are also returned. 
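  For intuition, the same construction on a toy geography (made-up names, not
  the real Locations lists):

      import numpy as np

      region_map = {"nat": ["pa", "tx"], "pa": ["pa"], "tx": ["tx"]}
      regions = sorted(region_map)                                  # ['nat', 'pa', 'tx']
      atoms = sorted({a for members in region_map.values() for a in members})  # ['pa', 'tx']
      graph = np.zeros((len(regions), len(atoms)))
      for i, r in enumerate(regions):
          for a in region_map[r]:
              graph[i, atoms.index(a)] = 1
      # rows of graph: nat -> [1, 1], pa -> [1, 0], tx -> [0, 1]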
- """ - - regions = sorted(Locations.region_list) - atoms = sorted(Locations.atom_list) - graph = np.zeros((len(regions), len(atoms))) - for i, r in enumerate(regions): - for a in Locations.region_map[r]: - j = atoms.index(a) - graph[i, j] = 1 - return graph, regions, atoms + regions = sorted(Locations.region_list) + atoms = sorted(Locations.atom_list) + graph = np.zeros((len(regions), len(atoms))) + for i, r in enumerate(regions): + for a in Locations.region_map[r]: + j = atoms.index(a) + graph[i, j] = 1 + return graph, regions, atoms def get_fusion_parameters(known_locations): - """ - Return a matrix that fuses known ILI values into unknown ILI values. The - corresponding lists of known and unknown locations are also returned. + """ + Return a matrix that fuses known ILI values into unknown ILI values. The + corresponding lists of known and unknown locations are also returned. - The goal is to infer ILI data in all locations, given ILI data in some - partial set of locations. This function takes a sensor fusion approach. + The goal is to infer ILI data in all locations, given ILI data in some + partial set of locations. This function takes a sensor fusion approach. - Let $z$ be a column vector of values in reported locations. Let $y$ be the - desired column vector of values in unreported locations. With matrices $H$ - (mapping from latent state to reported values), $W$ (mapping from latent - state to unreported values), and $R = I$ (covariance, which is identity): + Let $z$ be a column vector of values in reported locations. Let $y$ be the + desired column vector of values in unreported locations. With matrices $H$ + (mapping from latent state to reported values), $W$ (mapping from latent + state to unreported values), and $R = I$ (covariance, which is identity): - $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ - $y = W (H^T H)^{-1} H^T z$ + $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ + $y = W (H^T H)^{-1} H^T z$ - This is equavalent to OLS regression with an added translation from atomic - locations to missing locations. Unknown values are computed as a linear - combination of known values. - """ + This is equavalent to OLS regression with an added translation from atomic + locations to missing locations. Unknown values are computed as a linear + combination of known values. 
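    A small worked example of that formula on a toy geography, where region
    'ab' is the union of atoms 'a' and 'b' and only 'a' and 'ab' are reported
    (toy names and numbers, not real Delphi locations):

        import numpy as np

        # location graph: rows = regions (a, ab, b), columns = atoms (a, b)
        graph = np.array([[1.0, 0.0],   # region 'a'  covers atom a
                          [1.0, 1.0],   # region 'ab' covers atoms a and b
                          [0.0, 1.0]])  # region 'b'  covers atom b
        is_known = np.array([True, True, False])   # 'b' is the unreported region
        H, W = graph[is_known], graph[~is_known]   # H has full column rank here
        z = np.array([3.0, 10.0])                  # reported values for 'a', 'ab'
        y = W @ np.linalg.inv(H.T @ H) @ H.T @ z   # y = W (H^T H)^{-1} H^T z
        # y == array([7.]): the unreported region is recovered as ab - a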
+ """ - graph, regions, atoms = get_location_graph() - is_known = np.array([r in known_locations for r in regions]) - is_unknown = np.logical_not(is_known) - if not np.any(is_known): - raise StatespaceException('no values are known') - if not np.any(is_unknown): - raise StatespaceException('no values are unknown') + graph, regions, atoms = get_location_graph() + is_known = np.array([r in known_locations for r in regions]) + is_unknown = np.logical_not(is_known) + if not np.any(is_known): + raise StatespaceException("no values are known") + if not np.any(is_unknown): + raise StatespaceException("no values are unknown") - H = graph[is_known, :] - W = graph[is_unknown, :] - if np.linalg.matrix_rank(H) != len(atoms): - raise StatespaceException('system is underdetermined') + H = graph[is_known, :] + W = graph[is_unknown, :] + if np.linalg.matrix_rank(H) != len(atoms): + raise StatespaceException("system is underdetermined") - HtH = np.dot(H.T, H) - HtH_inv = np.linalg.inv(HtH) - H_pseudo_inv = np.dot(HtH_inv, H.T) - fuser = np.dot(W, H_pseudo_inv) + HtH = np.dot(H.T, H) + HtH_inv = np.linalg.inv(HtH) + H_pseudo_inv = np.dot(HtH_inv, H.T) + fuser = np.dot(W, H_pseudo_inv) - locations = np.array(regions) - filter_locations = lambda selected: list(map(str, locations[selected])) - return fuser, filter_locations(is_known), filter_locations(is_unknown) + locations = np.array(regions) + filter_locations = lambda selected: list(map(str, locations[selected])) + return fuser, filter_locations(is_known), filter_locations(is_unknown) def get_lag_and_ili(issue, epiweek, num_ili, num_patients): - """ - Compute and return reporting lag and percent ILI from imputed ILINet data. - """ - lag = delta_epiweeks(epiweek, issue) - ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) - return lag, ili + """ + Compute and return reporting lag and percent ILI from imputed ILINet data. + """ + lag = delta_epiweeks(epiweek, issue) + ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) + return lag, ili def impute_missing_values(database, test_mode=False): - """ - Determine whether values are missing for any states and territories. If so, - impute them and store them in the database. - """ - - # database connection - database.connect() - rows1 = database.count_rows() - print('rows before: %d' % (rows1)) - - # iterate over missing epiweeks - missing_rows = database.find_missing_rows() - print('missing data for %d epiweeks' % len(missing_rows)) - for issue, epiweek in missing_rows: - print('i=%d e=%d' % (issue, epiweek)) - - # get known values from table `fluview` - known_values = database.get_known_values(issue, epiweek) - - # Unlike most other state-level data, which typically begins publicly on - # 2010w40, data for PR begins on 2013w40. Before this, there are no reports - # for PR. Here we assume that no report is equivalent to a report of all - # zeros (number of ILI, patients, and providers). That's mostly true, with - # the notable exception of wILI, but that's not relevant here. By assuming - # that PR reports zero on those weeks, it's possible to impute values for - # VI, which are otherwise not reported until 2015w40. 
- assume_pr_zero = epiweek < 201340 and 'pr' not in known_values - if assume_pr_zero: - known_values['pr'] = (0, 0, 0) - - # get the imputation matrix and lists of known and unknown locations - F, known, unknown = get_fusion_parameters(known_values.keys()) - - # finally, impute the missing values - z = np.array([known_values[k] for k in known]) - y = np.dot(F, z) - - # possibly also record the assumptions made for PR - if assume_pr_zero: - unknown.append('pr') - y = np.vstack((y, [known_values['pr']])) - - # add lag and percent ILI to the data for each imputed location - imputed_values = {} - for loc, values in zip(unknown, y): - n_ili, n_pat, n_prov = map(int, np.rint(values)) - lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) - imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) - print(' %s: %s' % (loc, str(imputed_values[loc]))) - - # save all imputed values in table `fluview_imputed` - database.add_imputed_values(issue, epiweek, imputed_values) - - # database cleanup - rows2 = database.count_rows() - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - commit = not test_mode - database.close(commit) + """ + Determine whether values are missing for any states and territories. If so, + impute them and store them in the database. + """ + + # database connection + database.connect() + rows1 = database.count_rows() + print("rows before: %d" % (rows1)) + + # iterate over missing epiweeks + missing_rows = database.find_missing_rows() + print("missing data for %d epiweeks" % len(missing_rows)) + for issue, epiweek in missing_rows: + print("i=%d e=%d" % (issue, epiweek)) + + # get known values from table `fluview` + known_values = database.get_known_values(issue, epiweek) + + # Unlike most other state-level data, which typically begins publicly on + # 2010w40, data for PR begins on 2013w40. Before this, there are no reports + # for PR. Here we assume that no report is equivalent to a report of all + # zeros (number of ILI, patients, and providers). That's mostly true, with + # the notable exception of wILI, but that's not relevant here. By assuming + # that PR reports zero on those weeks, it's possible to impute values for + # VI, which are otherwise not reported until 2015w40. 
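        # Concretely (hypothetical case): for an epiweek before 2013w40, say
        # 201252, PR is absent from the known values; treating it as a report
        # of (0, 0, 0) adds PR to the known set so that the fusion step below
        # has enough reported locations to solve for VI as well.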
+ assume_pr_zero = epiweek < 201340 and "pr" not in known_values + if assume_pr_zero: + known_values["pr"] = (0, 0, 0) + + # get the imputation matrix and lists of known and unknown locations + F, known, unknown = get_fusion_parameters(known_values.keys()) + + # finally, impute the missing values + z = np.array([known_values[k] for k in known]) + y = np.dot(F, z) + + # possibly also record the assumptions made for PR + if assume_pr_zero: + unknown.append("pr") + y = np.vstack((y, [known_values["pr"]])) + + # add lag and percent ILI to the data for each imputed location + imputed_values = {} + for loc, values in zip(unknown, y): + n_ili, n_pat, n_prov = map(int, np.rint(values)) + lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) + imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) + print(f" {loc}: {str(imputed_values[loc])}") + + # save all imputed values in table `fluview_imputed` + database.add_imputed_values(issue, epiweek, imputed_values) + + # database cleanup + rows2 = database.count_rows() + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + commit = not test_mode + database.close(commit) def get_argument_parser(): - """Set up command line arguments and usage.""" - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - return parser + """Set up command line arguments and usage.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--test", action="store_true", help="do dry run only, do not update the database" + ) + return parser def main(): - """Run this script from the command line.""" - args = get_argument_parser().parse_args() - impute_missing_values(Database(), test_mode=args.test) + """Run this script from the command line.""" + args = get_argument_parser().parse_args() + impute_missing_values(Database(), test_mode=args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() From b8900a0bc846888885310911efd6e26459effa99 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:53:39 -0700 Subject: [PATCH 23/43] style(black): format ght acquisition --- src/acquisition/ght/ght_update.py | 587 ++++++++++---------- src/acquisition/ght/google_health_trends.py | 215 +++---- 2 files changed, 417 insertions(+), 385 deletions(-) diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index c1e9b8d94..76046c5c4 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -63,7 +63,7 @@ * fixed multiple-word queries (surround with quotes) 2015-12-01 * Original version -''' +""" # standard library import argparse @@ -88,304 +88,325 @@ # 2010-04-19 and 2015-05-05 # see: https://www.google.com/trends/correlate TERMS = [ - '/m/0cycc', - 'influenza type a', - 'flu duration', - 'flu fever', - 'treating flu', - 'fever flu', - 'flu recovery', - 'braun thermoscan', - 'oscillococcinum', - 'treating the flu', - 'cold or flu', - 'flu versus cold', - 'flu remedies', - 'contagious flu', - 'type a influenza', - 'flu or cold', - 'duration of flu', - 'cold versus flu', - 'flu cough', - 'flu headache', - 'thermoscan', - 'influenza incubation period', - 'flu lasts', - 'length of flu', - 'flu stomach', - 'cold vs flu', - 'flu and fever', - 'getting over the flu', - 'influenza a', - 'treatment for flu', - 'flu length', - 'treatment for the flu', - 'influenza symptoms', - 'over the counter flu', - 'flu 
complications', - 'cold and flu symptoms', - 'influenza incubation', - 'treatment of flu', - 'human temperature', - 'low body', - 'flu contagious', - 'robitussin ac', - 'flu how long', - 'ear thermometer', - 'flu contagious period', - 'treat flu', - 'cough flu', - 'low body temperature', - 'expectorant', - 'flu and cold', - 'rapid flu', - 'flu vs. cold', - 'how to treat the flu', - 'how long does the flu last?', - 'viral pneumonia', - 'flu in kids', - 'type a flu', - 'influenza treatment', - 'fighting the flu', - 'flu relief', - 'treat the flu', - 'flu medicine', - 'dangerous fever', - 'what is influenza', - 'tussin', - 'low body temp', - 'flu care', - 'flu in infants', - 'flu dizziness', - 'feed a fever', - 'flu vs cold', - 'flu vomiting', - 'bacterial pneumonia', - 'flu activity', - 'flu chills', - 'anas barbariae', - 'flu germs', - 'tylenol cold', - 'how to get over the flu', - 'flu in children', - 'influenza a and b', - 'duration of the flu', - 'cold symptoms', - 'flu report', - 'rapid flu test', - 'flu relapse', - 'get over the flu', - 'flu during pregnancy', - 'flu recovery time', - 'cure for flu', - 'tamiflu and breastfeeding', - 'flu chest pain', - 'flu treatment', - 'flu nausea', - 'remedies for the flu', - 'tamiflu in pregnancy', - 'side effects of tamiflu', - 'how to treat flu', - 'viral bronchitis', - 'flu how long contagious', - 'flu remedy', + "/m/0cycc", + "influenza type a", + "flu duration", + "flu fever", + "treating flu", + "fever flu", + "flu recovery", + "braun thermoscan", + "oscillococcinum", + "treating the flu", + "cold or flu", + "flu versus cold", + "flu remedies", + "contagious flu", + "type a influenza", + "flu or cold", + "duration of flu", + "cold versus flu", + "flu cough", + "flu headache", + "thermoscan", + "influenza incubation period", + "flu lasts", + "length of flu", + "flu stomach", + "cold vs flu", + "flu and fever", + "getting over the flu", + "influenza a", + "treatment for flu", + "flu length", + "treatment for the flu", + "influenza symptoms", + "over the counter flu", + "flu complications", + "cold and flu symptoms", + "influenza incubation", + "treatment of flu", + "human temperature", + "low body", + "flu contagious", + "robitussin ac", + "flu how long", + "ear thermometer", + "flu contagious period", + "treat flu", + "cough flu", + "low body temperature", + "expectorant", + "flu and cold", + "rapid flu", + "flu vs. 
cold", + "how to treat the flu", + "how long does the flu last?", + "viral pneumonia", + "flu in kids", + "type a flu", + "influenza treatment", + "fighting the flu", + "flu relief", + "treat the flu", + "flu medicine", + "dangerous fever", + "what is influenza", + "tussin", + "low body temp", + "flu care", + "flu in infants", + "flu dizziness", + "feed a fever", + "flu vs cold", + "flu vomiting", + "bacterial pneumonia", + "flu activity", + "flu chills", + "anas barbariae", + "flu germs", + "tylenol cold", + "how to get over the flu", + "flu in children", + "influenza a and b", + "duration of the flu", + "cold symptoms", + "flu report", + "rapid flu test", + "flu relapse", + "get over the flu", + "flu during pregnancy", + "flu recovery time", + "cure for flu", + "tamiflu and breastfeeding", + "flu chest pain", + "flu treatment", + "flu nausea", + "remedies for the flu", + "tamiflu in pregnancy", + "side effects of tamiflu", + "how to treat flu", + "viral bronchitis", + "flu how long contagious", + "flu remedy", ] # a list of all US states, including DC and the US as a whole LOCATIONS = [ - 'US', - 'AL', - 'AK', - 'AZ', - 'AR', - 'CA', - 'CO', - 'CT', - 'DC', - 'DE', - 'FL', - 'GA', - 'HI', - 'ID', - 'IL', - 'IN', - 'IA', - 'KS', - 'KY', - 'LA', - 'ME', - 'MD', - 'MA', - 'MI', - 'MN', - 'MS', - 'MO', - 'MT', - 'NE', - 'NV', - 'NH', - 'NJ', - 'NM', - 'NY', - 'NC', - 'ND', - 'OH', - 'OK', - 'OR', - 'PA', - 'RI', - 'SC', - 'SD', - 'TN', - 'TX', - 'UT', - 'VT', - 'VA', - 'WA', - 'WV', - 'WI', - 'WY', + "US", + "AL", + "AK", + "AZ", + "AR", + "CA", + "CO", + "CT", + "DC", + "DE", + "FL", + "GA", + "HI", + "ID", + "IL", + "IN", + "IA", + "KS", + "KY", + "LA", + "ME", + "MD", + "MA", + "MI", + "MN", + "MS", + "MO", + "MT", + "NE", + "NV", + "NH", + "NJ", + "NM", + "NY", + "NC", + "ND", + "OH", + "OK", + "OR", + "PA", + "RI", + "SC", + "SD", + "TN", + "TX", + "UT", + "VT", + "VA", + "WA", + "WV", + "WI", + "WY", ] -def update(locations, terms, first=None, last=None, countries=['US']): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() +def update(locations, terms, first=None, last=None, countries=["US"]): + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `ght`') - for (num,) in cur: - pass - return num + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `ght`") + for (num,) in cur: + pass + return num - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' % (ew0, ew1)) + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print("Checking epiweeks between %d and %d..." 
% (ew0, ew1)) - # keep track of how many rows were added - rows_before = get_num_rows() + # keep track of how many rows were added + rows_before = get_num_rows() - # check Google Trends for new and/or revised data - sql = ''' + # check Google Trends for new and/or revised data + sql = """ INSERT INTO `ght` (`query`, `location`, `epiweek`, `value`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - total_rows = 0 - ght = GHT(API_KEY) - for term in terms: - print(' [%s] using term' % term) - ll, cl = len(locations), len(countries) - for i in range(max(ll,cl)): - location = locations[i] if i < ll else locations[0] - country = countries[i] if i < cl else countries[0] - try: - #term2 = ('"%s"' % term) if ' ' in term else term - term2 = term - attempt = 0 - while True: - attempt += 1 - try: - result = ght.get_data(ew0, ew1, location, term2, country=country) - break - except Exception as ex: - if attempt >= 5: - raise ex - else: - delay = 2 ** attempt - print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex) - time.sleep(delay) - values = [p['value'] for p in result['data']['lines'][0]['points']] - ew = result['start_week'] - num_missing = 0 - for v in values: - # Default SQL location value for US country for backwards compatibility - # i.e. California's location is still stored as 'CA', - # and having location == 'US' is still stored as 'US' - sql_location = location if location != NO_LOCATION_STR else country - - # Change SQL location for non-US countries - if country != 'US': - # Underscore added to distinguish countries from 2-letter US states - sql_location = country + "_" - if location != NO_LOCATION_STR: - sql_location = sql_location + location - sql_data = (term, sql_location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - #print(' [%s|%s|%d] missing value' % (term, location, ew)) - ew = flu.add_epiweeks(ew, 1) - if num_missing > 0: - print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values))) - except Exception as ex: - print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + total_rows = 0 + ght = GHT(API_KEY) + for term in terms: + print(" [%s] using term" % term) + ll, cl = len(locations), len(countries) + for i in range(max(ll, cl)): + location = locations[i] if i < ll else locations[0] + country = countries[i] if i < cl else countries[0] + try: + # term2 = ('"%s"' % term) if ' ' in term else term + term2 = term + attempt = 0 + while True: + attempt += 1 + try: + result = ght.get_data(ew0, ew1, location, term2, country=country) + break + except Exception as ex: + if attempt >= 5: + raise ex + else: + delay = 2**attempt + print( + " [%s|%s] caught exception (will retry in %ds):" + % (term, location, delay), + ex, + ) + time.sleep(delay) + values = [p["value"] for p in result["data"]["lines"][0]["points"]] + ew = result["start_week"] + num_missing = 0 + for v in values: + # Default SQL location value for US country for backwards compatibility + # i.e. 
California's location is still stored as 'CA', + # and having location == 'US' is still stored as 'US' + sql_location = location if location != NO_LOCATION_STR else country + + # Change SQL location for non-US countries + if country != "US": + # Underscore added to distinguish countries from 2-letter US states + sql_location = country + "_" + if location != NO_LOCATION_STR: + sql_location = sql_location + location + sql_data = (term, sql_location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + # print(' [%s|%s|%d] missing value' % (term, location, ew)) + ew = flu.add_epiweeks(ew, 1) + if num_missing > 0: + print( + " [%s|%s] missing %d/%d value(s)" + % (term, location, num_missing, len(values)) + ) + except Exception as ex: + print(f" [{term}|{location}] caught exception (will NOT retry):", ex) + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('location', action='store', type=str, default=None, help='location(s) (ex: all; US; TX; CA,LA,WY)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--country', '-c', default='US', type=str, help='location country (ex: US; BR)') - args = parser.parse_args() - - # sanity check - first, last = args.first, args.last - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - elif args.location.lower() == 'none': - locations = [NO_LOCATION_STR] - else: - locations = args.location.upper().split(',') - if args.term.lower() == 'all': - terms = TERMS - else: - terms = [args.term] - - # country argument - # Check that country follows ISO 1366 Alpha-2 code. - # See https://www.iso.org/obp/ui/#search. 
- countries = args.country.upper().split(',') - if not all(map(lambda x: len(x) == 2, countries)): - raise Exception('country name must be two letters (ISO 1366 Alpha-2)') - - # if length of locations and countries is > 1, need to be the same - if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): - raise Exception('locations and countries must be length 1, or same length') - - # run the update - update(locations, terms, first, last, countries) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; US; TX; CA,LA,WY)", + ) + parser.add_argument( + "term", + action="store", + type=str, + default=None, + help='term/query/topic (ex: all; /m/0cycc; "flu fever")', + ) + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument( + "--country", "-c", default="US", type=str, help="location country (ex: US; BR)" + ) + args = parser.parse_args() + + # sanity check + first, last = args.first, args.last + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + elif args.location.lower() == "none": + locations = [NO_LOCATION_STR] + else: + locations = args.location.upper().split(",") + if args.term.lower() == "all": + terms = TERMS + else: + terms = [args.term] + + # country argument + # Check that country follows ISO 1366 Alpha-2 code. + # See https://www.iso.org/obp/ui/#search. 
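    # For example (hypothetical inputs): "US,BR" splits into ["US", "BR"] and
    # passes the two-letter length check below, while "USA", "Brazil", or a
    # mixed value like "US,BRA" is rejected.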
+ countries = args.country.upper().split(",") + if not all(map(lambda x: len(x) == 2, countries)): + raise Exception("country name must be two letters (ISO 1366 Alpha-2)") + + # if length of locations and countries is > 1, need to be the same + if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): + raise Exception("locations and countries must be length 1, or same length") + + # run the update + update(locations, terms, first, last, countries) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 66a11c227..7fd95f9a4 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -18,7 +18,7 @@ + sample command line usage + extract array of values from returned data * separated GHT class from ght_update.py -''' +""" # standard library import argparse @@ -31,109 +31,120 @@ from delphi.utils.epidate import EpiDate import delphi.utils.epiweek as flu -NO_LOCATION_STR = 'none' +NO_LOCATION_STR = "none" + class GHT: - # Google Trends API endpoint - DISCOVERY_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest' - - def __init__(self, key, delay=1): - self.service = build('trends', 'v1beta', developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) - self.delay = delay - - # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) - @staticmethod - def _ew2date(ew): - # parse the epiweek - year, week = flu.split_epiweek(ew) - # get the date object (middle of the week; Wednesday) - date = EpiDate.from_epiweek(year, week) - # go to the first day of the week (Sunday) - date = date.add_days(-3) - # date as string - return str(date) - - # get data from Google APIs - # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth - def get_data(self, start_week, end_week, location, term, resolution='week', country='US'): - start_date = GHT._ew2date(start_week) - end_date = GHT._ew2date(end_week) - num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 - - # getTimelinesForHealth parameters - params = { - 'terms': term, - 'time_startDate': start_date, - 'time_endDate': end_date, - 'timelineResolution': resolution, - } - # We have a special check for the US for backwards compatibility. - # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. - # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
- if country == 'US': - if location == 'US' or location == NO_LOCATION_STR: - params['geoRestriction_country'] = 'US' - else: - params['geoRestriction_region'] = 'US-' + location - else: - if location == NO_LOCATION_STR: - params['geoRestriction_country'] = country - else: - params['geoRestriction_region'] = country + '-' + location - - # make the API call - data = self.service.getTimelinesForHealth(**params).execute() - - # extract the values - try: - values = [p['value'] for p in data['lines'][0]['points']] - except: - values = None - - # throttle request rate - time.sleep(self.delay) - - # return the results - return { - 'start_week': start_week, - 'end_week': end_week, - 'num_weeks': num_weeks, - 'location': location, - 'country' : country, - 'term': term, - 'resolution': resolution, - 'data': data, - 'values': values, - } + # Google Trends API endpoint + DISCOVERY_URL = "https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest" + + def __init__(self, key, delay=1): + self.service = build( + "trends", "v1beta", developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL + ) + self.delay = delay + + # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) + @staticmethod + def _ew2date(ew): + # parse the epiweek + year, week = flu.split_epiweek(ew) + # get the date object (middle of the week; Wednesday) + date = EpiDate.from_epiweek(year, week) + # go to the first day of the week (Sunday) + date = date.add_days(-3) + # date as string + return str(date) + + # get data from Google APIs + # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth + def get_data(self, start_week, end_week, location, term, resolution="week", country="US"): + start_date = GHT._ew2date(start_week) + end_date = GHT._ew2date(end_week) + num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 + + # getTimelinesForHealth parameters + params = { + "terms": term, + "time_startDate": start_date, + "time_endDate": end_date, + "timelineResolution": resolution, + } + # We have a special check for the US for backwards compatibility. + # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. + # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
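The comment above describes the only non-obvious branch in GHT.get_data: when the country is US, a location of US (or the none sentinel) becomes a country-level geo restriction, while any other location becomes the region US-<location>; for any other country the same split applies with that country's code. A standalone sketch of just that branch, factored into a hypothetical helper (the real code inlines this while building the getTimelinesForHealth params):

NO_LOCATION_STR = "none"


def geo_restriction(country, location):
    """Return the single geoRestriction_* parameter for a (country, location) pair."""
    if country == "US":
        if location in ("US", NO_LOCATION_STR):
            return {"geoRestriction_country": "US"}
        return {"geoRestriction_region": "US-" + location}
    if location == NO_LOCATION_STR:
        return {"geoRestriction_country": country}
    return {"geoRestriction_region": country + "-" + location}


assert geo_restriction("US", "US") == {"geoRestriction_country": "US"}
assert geo_restriction("US", "TX") == {"geoRestriction_region": "US-TX"}
assert geo_restriction("BR", NO_LOCATION_STR) == {"geoRestriction_country": "BR"}
# the ambiguous case the comment warns about: a non-US sub-region whose code is "US"
assert geo_restriction("BR", "US") == {"geoRestriction_region": "BR-US"}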
+ if country == "US": + if location == "US" or location == NO_LOCATION_STR: + params["geoRestriction_country"] = "US" + else: + params["geoRestriction_region"] = "US-" + location + else: + if location == NO_LOCATION_STR: + params["geoRestriction_country"] = country + else: + params["geoRestriction_region"] = country + "-" + location + + # make the API call + data = self.service.getTimelinesForHealth(**params).execute() + + # extract the values + try: + values = [p["value"] for p in data["lines"][0]["points"]] + except: + values = None + + # throttle request rate + time.sleep(self.delay) + + # return the results + return { + "start_week": start_week, + "end_week": end_week, + "num_weeks": num_weeks, + "location": location, + "country": country, + "term": term, + "resolution": resolution, + "data": data, + "values": values, + } def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('apikey', action='store', type=str, default=None, help='API key') - parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)') - parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)') - parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)') - args = parser.parse_args() - - # get the data - ght = GHT(args.apikey) - result = ght.get_data(args.startweek, args.endweek, args.location, args.term) - values = result['values'] - - # sanity check - expected_weeks = result['num_weeks'] - received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) - if expected_weeks != received_weeks: - raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks)) - - # results - epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] - for (epiweek, value) in zip(epiweeks, values): - print('%6d: %.3f' % (epiweek, value)) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("apikey", action="store", type=str, default=None, help="API key") + parser.add_argument( + "startweek", action="store", type=int, default=None, help="first week (ex: 201440)" + ) + parser.add_argument( + "endweek", action="store", type=int, default=None, help="last week (ex: 201520)" + ) + parser.add_argument( + "location", action="store", type=str, default=None, help="location (ex: US)" + ) + parser.add_argument( + "term", action="store", type=str, default=None, help="term/query/topic (ex: /m/0cycc)" + ) + args = parser.parse_args() + + # get the data + ght = GHT(args.apikey) + result = ght.get_data(args.startweek, args.endweek, args.location, args.term) + values = result["values"] + + # sanity check + expected_weeks = result["num_weeks"] + received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) + if expected_weeks != received_weeks: + raise Exception("expected %d weeks, received %d" % (expected_weeks, received_weeks)) + + # results + epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] + for (epiweek, value) in zip(epiweeks, values): + print("%6d: %.3f" % (epiweek, value)) + + +if __name__ == "__main__": + main() From a849384c884934b3b7c3c67b68aa6240277d6b6d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:54:09 -0700 Subject: [PATCH 24/43] 
style(black): format kcdc acquisition --- src/acquisition/kcdc/kcdc_update.py | 90 ++++++++++++++++------------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index 70c167738..b2c12dba9 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -42,12 +42,14 @@ from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, add_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `kcdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -58,69 +60,76 @@ def ensure_tables_exist(): `ili` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='kcdc_ili'): - # Count and return the number of rows in the `kcdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="kcdc_ili"): + # Count and return the number of rows in the `kcdc_ili` table. + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def get_kcdc_data(): issue = EpiDate.today().get_ew() - last_season = issue//100 + (1 if issue % 100 > 35 else 0) - url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do' + last_season = issue // 100 + (1 if issue % 100 > 35 else 0) + url = "https://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do" + # Started in 2004 params = { - 'icdNm': 'influenza', - 'startYear': '2004', # Started in 2004 - 'endYear': str(last_season) + "icdNm": "influenza", + "startYear": "2004", + "endYear": str(last_season), } response = requests.post(url, params) datas = response.json() - data = datas['data'] + data = datas["data"] ews = [] ilis = [] ew1 = 200436 - for year in range(2004,last_season): - year_data = data[year-2004] + for year in range(2004, last_season): + year_data = data[year - 2004] if year > 2004: ew1 = ews[-1] + 1 - ili_yr = year_data["VALUE"].split('`') - ili_yr = [float(f) for f in ili_yr if f != ''] - ew2 = add_epiweeks(ew1,len(ili_yr)) - new_ews = list(range_epiweeks(ew1,ew2)) + ili_yr = year_data["VALUE"].split("`") + ili_yr = [float(f) for f in ili_yr if f != ""] + ew2 = add_epiweeks(ew1, len(ili_yr)) + new_ews = list(range_epiweeks(ew1, ew2)) for i in range(len(new_ews)): j = float(ili_yr[i]) ilis.append(j) ews.append(new_ews[i]) return ews, ilis + def update_from_data(ews, ilis, date, issue, test_mode=False): u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) + print("rows before: %d" % (rows1)) insert = cnx.cursor() - sql = ''' + sql = """ INSERT INTO `kcdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `ili`) @@ -129,15 
+138,15 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `ili` = %s - ''' + """ for i in range(len(ews)): ew = ews[i] ili = ilis[i] lag = delta_epiweeks(ews[i], issue) - insert_args = [date,issue,ew,'ROK',lag,ili] - update_args = [date,ili] + insert_args = [date, issue, ew, "ROK", lag, ili] + update_args = [date, ili] try: insert.execute(sql % tuple(insert_args + update_args)) except Exception: @@ -146,34 +155,33 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", action="store_true", help="do dry run only, do not update the database" ) args = parser.parse_args() - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) issue = EpiDate.today().get_ew() ensure_tables_exist() - ews,ilis = get_kcdc_data() + ews, ilis = get_kcdc_data() update_from_data(ews, ilis, date, issue, test_mode=args.test) -if __name__ == '__main__': +if __name__ == "__main__": main() From d04af3c02fda7708a16bec0952b1aa7475acaec7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:04:35 -0700 Subject: [PATCH 25/43] style(black): format nidss acquisition --- src/acquisition/nidss/taiwan_nidss.py | 433 +++++++++++++------------ src/acquisition/nidss/taiwan_update.py | 162 +++++---- 2 files changed, 296 insertions(+), 299 deletions(-) diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 27da863e1..57f4e272d 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -4,7 +4,7 @@ =============== Scrapes weekly flu data from Taiwan's National Infectious Disease Statistics -System (NIDSS): http://nidss.cdc.gov.tw/en/ +System (NIDSS): https://nidss.cdc.gov.tw/en/ ================= @@ -37,233 +37,234 @@ class NIDSS: - """An API for scraping the NIDSS site.""" + """An API for scraping the NIDSS site.""" - # The page where the flu data is kept - FLU_URL = 'https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh' + # The page where the flu data is kept + FLU_URL = "https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh" - # Link to the dengue data - DENGUE_URL = 'http://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv' + # Link to the dengue data + DENGUE_URL = "https://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv" - # Translate location names to English - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - _TRANSLATED = { - b'5Y2X5oqV57ij': 'Nantou_County', - b'5Y+w5Lit5biC': 'Taichung_City', - b'5Y+w5YyX5biC': 'Taipei_City', - b'5Y+w5Y2X5biC': 'Tainan_City', - b'5Y+w5p2x57ij': 'Taitung_County', - b'5ZiJ576p5biC': 'Chiayi_City', - b'5ZiJ576p57ij': 'Chiayi_County', - b'5Z+66ZqG5biC': 'Keelung_City', - b'5a6c6Jit57ij': 'Yilan_County', - b'5bGP5p2x57ij': 'Pingtung_County', - b'5b2w5YyW57ij': 'Changhua_County', - b'5paw5YyX5biC': 'New_Taipei_City', - b'5paw56u55biC': 
'Hsinchu_City', - b'5paw56u557ij': 'Hsinchu_County', - b'5qGD5ZyS5biC': 'Taoyuan_City', - b'5r6O5rmW57ij': 'Penghu_County', - b'6Iqx6JOu57ij': 'Hualien_County', - b'6IuX5qCX57ij': 'Miaoli_County', - b'6YeR6ZaA57ij': 'Kinmen_County', - b'6Zuy5p6X57ij': 'Yunlin_County', - b'6auY6ZuE5biC': 'Kaohsiung_City', - b'6YCj5rGf57ij': 'Lienchiang_County', - } + # Translate location names to English + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + _TRANSLATED = { + b"5Y2X5oqV57ij": "Nantou_County", + b"5Y+w5Lit5biC": "Taichung_City", + b"5Y+w5YyX5biC": "Taipei_City", + b"5Y+w5Y2X5biC": "Tainan_City", + b"5Y+w5p2x57ij": "Taitung_County", + b"5ZiJ576p5biC": "Chiayi_City", + b"5ZiJ576p57ij": "Chiayi_County", + b"5Z+66ZqG5biC": "Keelung_City", + b"5a6c6Jit57ij": "Yilan_County", + b"5bGP5p2x57ij": "Pingtung_County", + b"5b2w5YyW57ij": "Changhua_County", + b"5paw5YyX5biC": "New_Taipei_City", + b"5paw56u55biC": "Hsinchu_City", + b"5paw56u557ij": "Hsinchu_County", + b"5qGD5ZyS5biC": "Taoyuan_City", + b"5r6O5rmW57ij": "Penghu_County", + b"6Iqx6JOu57ij": "Hualien_County", + b"6IuX5qCX57ij": "Miaoli_County", + b"6YeR6ZaA57ij": "Kinmen_County", + b"6Zuy5p6X57ij": "Yunlin_County", + b"6auY6ZuE5biC": "Kaohsiung_City", + b"6YCj5rGf57ij": "Lienchiang_County", + } - # Map locations to regions - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy - LOCATION_TO_REGION = { - # Taipei - 'Taipei_City': 'Taipei', - 'Keelung_City': 'Taipei', - 'New_Taipei_City': 'Taipei', - 'Yilan_County': 'Taipei', - 'Kinmen_County': 'Taipei', - 'Lienchiang_County': 'Taipei', - # Northern - 'Hsinchu_City': 'Northern', - 'Taoyuan_City': 'Northern', - 'Hsinchu_County': 'Northern', - 'Miaoli_County': 'Northern', - # Central - 'Taichung_City': 'Central', - 'Changhua_County': 'Central', - 'Nantou_County': 'Central', - # Southern - 'Tainan_City': 'Southern', - 'Chiayi_City': 'Southern', - 'Yunlin_County': 'Southern', - 'Chiayi_County': 'Southern', - # Kaoping - 'Kaohsiung_City': 'Kaoping', - 'Pingtung_County': 'Kaoping', - 'Penghu_County': 'Kaoping', - # Eastern - 'Hualien_County': 'Eastern', - 'Taitung_County': 'Eastern', - } + # Map locations to regions + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy + LOCATION_TO_REGION = { + # Taipei + "Taipei_City": "Taipei", + "Keelung_City": "Taipei", + "New_Taipei_City": "Taipei", + "Yilan_County": "Taipei", + "Kinmen_County": "Taipei", + "Lienchiang_County": "Taipei", + # Northern + "Hsinchu_City": "Northern", + "Taoyuan_City": "Northern", + "Hsinchu_County": "Northern", + "Miaoli_County": "Northern", + # Central + "Taichung_City": "Central", + "Changhua_County": "Central", + "Nantou_County": "Central", + # Southern + "Tainan_City": "Southern", + "Chiayi_City": "Southern", + "Yunlin_County": "Southern", + "Chiayi_County": "Southern", + # Kaoping + "Kaohsiung_City": "Kaoping", + "Pingtung_County": "Kaoping", + "Penghu_County": "Kaoping", + # Eastern + "Hualien_County": "Eastern", + "Taitung_County": "Eastern", + } - @staticmethod - def _get_metadata(html): - issue_pattern = re.compile('^.*Latest available data: Week (\\d+), (\\d{4})\\..*$') - release_pattern = re.compile('^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$') - issue, release = None, None - for line in html.split('\n'): - match = issue_pattern.match(line) - if match is not None: - year, week = int(match.group(2)), 
int(match.group(1)) - issue = year * 100 + week - match = release_pattern.match(line) - if match is not None: - year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) - release = '%04d-%02d-%02d' % (year, month, day) - if issue is None or release is None: - raise Exception('metadata not found') - return issue, release + @staticmethod + def _get_metadata(html): + issue_pattern = re.compile("^.*Latest available data: Week (\\d+), (\\d{4})\\..*$") + release_pattern = re.compile( + "^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$" + ) + issue, release = None, None + for line in html.split("\n"): + match = issue_pattern.match(line) + if match is not None: + year, week = int(match.group(2)), int(match.group(1)) + issue = year * 100 + week + match = release_pattern.match(line) + if match is not None: + year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) + release = "%04d-%02d-%02d" % (year, month, day) + if issue is None or release is None: + raise Exception("metadata not found") + return issue, release - @staticmethod - def _get_flu_data(html): - week_pattern = re.compile('^categories: \\[(.*)\\],$') - value_pattern = re.compile('^series: \\[(.*)\\],$') - data = {} - parsing_ili = True - for line in html.split('\n'): - line = line.strip() - match = week_pattern.match(line) - if match is not None: - weeks = [int(x[1:-1]) for x in match.group(1).split(',')] - for week in weeks: - check_epiweek(week) - if week not in data: - data[week] = {} - match = value_pattern.match(line) - if match is not None: - for item in match.group(1).split('},{'): - parts = item.replace('{', '').replace('}', '').strip().split(' ') - location = parts[1][1:-2] - def num(value): - if parsing_ili: - return float(value) - else: - if '.' in value: - raise Exception('expected type int for visits') - return int(value) - values = [num(x) for x in parts[3][1:-1].split(',')] - unit = 'ili' if parsing_ili else 'visits' - if len(weeks) != len(values): - raise Exception('len(weeks) != len(values)') - for week, value in zip(weeks, values): - if location not in data[week]: - data[week][location] = {} - data[week][location][unit] = value - parsing_ili = False - if len(data) == 0: - raise Exception('no data') - return data + @staticmethod + def _get_flu_data(html): + week_pattern = re.compile("^categories: \\[(.*)\\],$") + value_pattern = re.compile("^series: \\[(.*)\\],$") + data = {} + parsing_ili = True + for line in html.split("\n"): + line = line.strip() + match = week_pattern.match(line) + if match is not None: + weeks = [int(x[1:-1]) for x in match.group(1).split(",")] + for week in weeks: + check_epiweek(week) + if week not in data: + data[week] = {} + match = value_pattern.match(line) + if match is not None: + for item in match.group(1).split("},{"): + parts = item.replace("{", "").replace("}", "").strip().split(" ") + location = parts[1][1:-2] + + def num(value): + if parsing_ili: + return float(value) + else: + if "." 
in value: + raise Exception("expected type int for visits") + return int(value) - @staticmethod - def get_flu_data(): - # Fetch the flu page - response = requests.get(NIDSS.FLU_URL) - if response.status_code != 200: - raise Exception('request failed [%d]' % response.status_code) - html = response.text - # Parse metadata - latest_week, release_date = NIDSS._get_metadata(html) - # Parse flu data - data = NIDSS._get_flu_data(html) - # Return results indexed by week and location - return latest_week, release_date, data + values = [num(x) for x in parts[3][1:-1].split(",")] + unit = "ili" if parsing_ili else "visits" + if len(weeks) != len(values): + raise Exception("len(weeks) != len(values)") + for week, value in zip(weeks, values): + if location not in data[week]: + data[week][location] = {} + data[week][location][unit] = value + parsing_ili = False + if len(data) == 0: + raise Exception("no data") + return data - @staticmethod - def get_dengue_data(first_week, last_week): - # Check week order - if first_week > last_week: - first_week, last_week = last_week, first_week - # Bounds check - if first_week < 200301 or last_week < 200301: - raise Exception('week out of range') - # Initialize data by week and location (zeroes are not reported) - data = {} - for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): - data[week] = {} - for location in NIDSS.LOCATION_TO_REGION.keys(): - data[week][location] = 0 - # Download CSV - response = requests.get(NIDSS.DENGUE_URL) - if response.status_code != 200: - raise Exception('export Dengue failed [%d]' % response.status_code) - csv = response.content.decode('big5-tw') - # Parse the data - lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != ''] - for line in lines: - fields = line.split(',') - location_b64 = base64.b64encode(fields[3].encode('utf-8')) - location = NIDSS._TRANSLATED[location_b64] - # Fields currently unused: - # region = NIDSS.LOCATION_TO_REGION[location] - # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) - # imported = imported_b64 == b'5piv' - # sex = fields[5] - # age = fields[7] - count = int(fields[8]) - year = int(fields[1]) - week = int(fields[2]) - # Week 53 was reported each year in 2003-2007 - if year < 2008 and year != 2003 and week > 52: - week = 52 - # Epiweek system change in 2009 - # See also: http://research.undefinedx.com/forum/index.php?topic=300.0 - if year == 2009: - week -= 1 - if week == 0: - year, week = 2008, 53 - epiweek = year * 100 + week - if epiweek < first_week or epiweek > last_week: - # Outside of the requested range - continue - if epiweek not in data or location not in data[epiweek]: - # Not a vaild U.S. 
epiweek - raise Exception('data missing %d-%s' % (epiweek, location)) - # Add the counts to the location on this epiweek - data[epiweek][location] += count - # Return results indexed by week and location - return data + @staticmethod + def get_flu_data(): + # Fetch the flu page + response = requests.get(NIDSS.FLU_URL) + if response.status_code != 200: + raise Exception("request failed [%d]" % response.status_code) + html = response.text + # Parse metadata + latest_week, release_date = NIDSS._get_metadata(html) + # Parse flu data + data = NIDSS._get_flu_data(html) + # Return results indexed by week and location + return latest_week, release_date, data + + @staticmethod + def get_dengue_data(first_week, last_week): + # Check week order + if first_week > last_week: + first_week, last_week = last_week, first_week + # Bounds check + if first_week < 200301 or last_week < 200301: + raise Exception("week out of range") + # Initialize data by week and location (zeroes are not reported) + data = {} + for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): + data[week] = {} + for location in NIDSS.LOCATION_TO_REGION.keys(): + data[week][location] = 0 + # Download CSV + response = requests.get(NIDSS.DENGUE_URL) + if response.status_code != 200: + raise Exception("export Dengue failed [%d]" % response.status_code) + csv = response.content.decode("big5-tw") + # Parse the data + lines = [l.strip() for l in csv.split("\n")[1:] if l.strip() != ""] + for line in lines: + fields = line.split(",") + location_b64 = base64.b64encode(fields[3].encode("utf-8")) + location = NIDSS._TRANSLATED[location_b64] + # Fields currently unused: + # region = NIDSS.LOCATION_TO_REGION[location] + # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) + # imported = imported_b64 == b'5piv' + # sex = fields[5] + # age = fields[7] + count = int(fields[8]) + year = int(fields[1]) + week = int(fields[2]) + # Week 53 was reported each year in 2003-2007 + if year < 2008 and year != 2003 and week > 52: + week = 52 + # Epiweek system change in 2009 + # See also: https://research.undefinedx.com/forum/index.php?topic=300.0 + if year == 2009: + week -= 1 + if week == 0: + year, week = 2008, 53 + epiweek = year * 100 + week + if epiweek < first_week or epiweek > last_week: + # Outside of the requested range + continue + if epiweek not in data or location not in data[epiweek]: + # Not a vaild U.S. 
epiweek + raise Exception("data missing %d-%s" % (epiweek, location)) + # Add the counts to the location on this epiweek + data[epiweek][location] += count + # Return results indexed by week and location + return data def main(): - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'epiweek', - action='store', - type=int, - help='fetch data on this epiweek (ex: 201537)' - ) - args = parser.parse_args() - ew = args.epiweek + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "epiweek", action="store", type=int, help="fetch data on this epiweek (ex: 201537)" + ) + args = parser.parse_args() + ew = args.epiweek - # Get the data - latest_week, release_date, fdata = NIDSS.get_flu_data() - ddata = NIDSS.get_dengue_data(ew, ew) + # Get the data + latest_week, release_date, fdata = NIDSS.get_flu_data() + ddata = NIDSS.get_dengue_data(ew, ew) - # Print the results - print('*** Meta ***') - print('latest_week:', latest_week) - print('release_date:', release_date) - print('*** Flu ***') - for region in sorted(list(fdata[ew].keys())): - visits, ili = fdata[ew][region]['visits'], fdata[ew][region]['ili'] - print('region=%s | visits=%d | ili=%.3f' % (region, visits, ili)) - print('*** Dengue ***') - for location in sorted(list(ddata[ew].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = ddata[ew][location] - print('location=%s | region=%s | count=%d' % (location, region, count)) + # Print the results + print("*** Meta ***") + print("latest_week:", latest_week) + print("release_date:", release_date) + print("*** Flu ***") + for region in sorted(list(fdata[ew].keys())): + visits, ili = fdata[ew][region]["visits"], fdata[ew][region]["ili"] + print("region=%s | visits=%d | ili=%.3f" % (region, visits, ili)) + print("*** Dengue ***") + for location in sorted(list(ddata[ew].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = ddata[ew][location] + print("location=%s | region=%s | count=%d" % (location, region, count)) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index 830a7738d..c22f0dfaa 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -87,92 +87,88 @@ # Get a row count just to know how many new rows are inserted def get_rows(cnx): - select = cnx.cursor() - select.execute('SELECT count(1) num FROM nidss_flu') - for (num,) in select: - rows_flu = num - select.execute('SELECT count(1) num FROM nidss_dengue') - for (num,) in select: - rows_dengue = num - select.close() - return (rows_flu, rows_dengue) + select = cnx.cursor() + select.execute("SELECT count(1) num FROM nidss_flu") + for (num,) in select: + rows_flu = num + select.execute("SELECT count(1) num FROM nidss_dengue") + for (num,) in select: + rows_dengue = num + select.close() + return (rows_flu, rows_dengue) def update(test_mode=False): - # test mode - if test_mode: - print('test mode enabled: changes will not be saved') - - # Database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before (flu): %d' % (rows1[0])) - print('rows before (dengue): %d' % (rows1[1])) - insert = cnx.cursor() - sql_flu = ''' - INSERT INTO - `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) - VALUES - (%s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, 
%s), `visits` = %s, `ili` = %s - ''' - sql_dengue = ''' - INSERT INTO - `nidss_dengue` (`epiweek`, `location`, `region`, `count`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `count` = %s - ''' - - # Scrape flu data - current_week, release_date, data = NIDSS.get_flu_data() - for epiweek in sorted(list(data.keys())): - lag = delta_epiweeks(epiweek, current_week) - for region in data[epiweek].keys(): - visits, ili = data[epiweek][region]['visits'], data[epiweek][region]['ili'] - params1 = [release_date, current_week, epiweek, region, lag, visits, ili] - params2 = [release_date, visits, ili] - insert.execute(sql_flu, tuple(params1 + params2)) - - # Scrape dengue data from the past year - data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) - for epiweek in sorted(list(data.keys())): - for location in sorted(list(data[epiweek].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = data[epiweek][location] - params = (epiweek, location, region, count, count) - insert.execute(sql_dengue, params) - - # Cleanup - insert.close() - rows2 = get_rows(cnx) - print('rows after (flu): %d (added %d)' % (rows2[0], rows2[0] - rows1[0])) - print('rows after (dengue): %d (added %d)' % (rows2[1], rows2[1] - rows1[1])) - if test_mode: - print('test mode: changes not commited') - else: - cnx.commit() - cnx.close() + # test mode + if test_mode: + print("test mode enabled: changes will not be saved") + + # Database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print("rows before (flu): %d" % (rows1[0])) + print("rows before (dengue): %d" % (rows1[1])) + insert = cnx.cursor() + sql_flu = """ + INSERT INTO + `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) + VALUES + (%s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), `visits` = %s, `ili` = %s + """ + sql_dengue = """ + INSERT INTO + `nidss_dengue` (`epiweek`, `location`, `region`, `count`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `count` = %s + """ + + # Scrape flu data + current_week, release_date, data = NIDSS.get_flu_data() + for epiweek in sorted(list(data.keys())): + lag = delta_epiweeks(epiweek, current_week) + for region in data[epiweek].keys(): + visits, ili = data[epiweek][region]["visits"], data[epiweek][region]["ili"] + params1 = [release_date, current_week, epiweek, region, lag, visits, ili] + params2 = [release_date, visits, ili] + insert.execute(sql_flu, tuple(params1 + params2)) + + # Scrape dengue data from the past year + data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) + for epiweek in sorted(list(data.keys())): + for location in sorted(list(data[epiweek].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = data[epiweek][location] + params = (epiweek, location, region, count, count) + insert.execute(sql_dengue, params) + + # Cleanup + insert.close() + rows2 = get_rows(cnx) + print("rows after (flu): %d (added %d)" % (rows2[0], rows2[0] - rows1[0])) + print("rows after (dengue): %d (added %d)" % (rows2[1], rows2[1] - rows1[1])) + if test_mode: + print("test mode: changes not commited") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - '-t', - action='store_true', - default=False, - help='test mode, do not commit changes' - ) - args = parser.parse_args() - - # fetch and store NIDSS data - 
update(args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "--test", "-t", action="store_true", default=False, help="test mode, do not commit changes" + ) + args = parser.parse_args() + + # fetch and store NIDSS data + update(args.test) + + +if __name__ == "__main__": + main() From 7f60fbba572c1b6e5153a9ef216895bdc2f7f5b3 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:07:32 -0700 Subject: [PATCH 26/43] style(black): format paho acquisition --- src/acquisition/paho/paho_db_update.py | 137 ++++++++++++++----------- src/acquisition/paho/paho_download.py | 114 ++++++++++++-------- 2 files changed, 147 insertions(+), 104 deletions(-) diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index d07885f79..08577f580 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -50,9 +50,8 @@ import csv import datetime import glob -import subprocess -import random from io import StringIO +import tempfile # third party import mysql.connector @@ -64,12 +63,14 @@ from delphi.utils.epiweek import delta_epiweeks, check_epiweek from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `paho_dengue` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -85,35 +86,44 @@ def ensure_tables_exist(): `num_deaths` INT(11) NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='paho_dengue'): - # Count and return the number of rows in the `fluview` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="paho_dengue"): + # Count and return the number of rows in the `fluview` table. 
+ select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def get_paho_row(row): - if row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(","): - raise Exception('PAHO header row has changed') + if row[ + 0 + ] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split( + "," + ): + raise Exception("PAHO header row has changed") if len(row) == 1 or row[0] == "Incidence Rate (c)": # this is a header row return None @@ -128,23 +138,26 @@ def get_paho_row(row): except: return None try: - check_epiweek(safe_int(row[13])*100 + safe_int(row[8]), safe_int(row[13])*100 + safe_int(row[6])) + check_epiweek( + safe_int(row[13]) * 100 + safe_int(row[8]), safe_int(row[13]) * 100 + safe_int(row[6]) + ) except: return None return { - 'issue': safe_int(row[13])*100 + safe_int(row[6]), - 'epiweek': safe_int(row[13])*100 + safe_int(row[8]), - 'region': country, - 'total_pop': safe_int(row[14]), - 'serotype': row[10], - 'num_dengue': safe_int(row[12]), - 'incidence_rate': safe_float(row[0]), - 'num_severe': safe_int(row[11]), - 'num_deaths': safe_int(row[5]), - 'severe_ratio': safe_float(row[1]), - 'cfr': safe_float(row[2]) + "issue": safe_int(row[13]) * 100 + safe_int(row[6]), + "epiweek": safe_int(row[13]) * 100 + safe_int(row[8]), + "region": country, + "total_pop": safe_int(row[14]), + "serotype": row[10], + "num_dengue": safe_int(row[12]), + "incidence_rate": safe_float(row[0]), + "num_severe": safe_int(row[11]), + "num_deaths": safe_int(row[5]), + "severe_ratio": safe_float(row[1]), + "cfr": safe_float(row[2]), } + def update_from_file(issue, date, filename, test_mode=False): # Read PAHO data from CSV and insert into (or update) the database. 
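update_from_file below, like the other acquisition updaters touched in this patch series, relies on a UNIQUE KEY plus INSERT ... ON DUPLICATE KEY UPDATE so that re-running an import never duplicates rows, and on least(release_date, ...) so the earliest release date is kept. A minimal sketch of that idiom against a hypothetical demo table, using bound parameters (the scripts themselves build the statement with % interpolation); the table name, schema, and connection details are placeholders, not part of the real epidata schema:

import mysql.connector

# Hypothetical table, for illustration only:
#   CREATE TABLE demo_dengue (
#     release_date DATE NOT NULL, epiweek INT NOT NULL, region VARCHAR(16) NOT NULL,
#     num_dengue INT NOT NULL, UNIQUE KEY (epiweek, region));
UPSERT = """
    INSERT INTO demo_dengue (release_date, epiweek, region, num_dengue)
    VALUES (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
      release_date = LEAST(release_date, %s),
      num_dengue = %s
"""


def upsert_rows(cnx, rows):
    """rows: iterable of (release_date, epiweek, region, num_dengue) tuples."""
    cur = cnx.cursor()
    for (date, epiweek, region, num) in rows:
        # insert args followed by the ON DUPLICATE KEY UPDATE args, as in the real scripts
        cur.execute(UPSERT, (date, epiweek, region, num, date, num))
    cur.close()
    cnx.commit()


# usage sketch (credentials are placeholders):
# cnx = mysql.connector.connect(user="epi", password="...", database="epidata")
# upsert_rows(cnx, [("2023-06-21", 202324, "Paraguay", 12)])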
@@ -156,23 +169,23 @@ def update_from_file(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'paho_dengue') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "paho_dengue") + print("rows before: %d" % (rows1)) insert = cnx.cursor() # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - with open(filename,'r',encoding='utf-8') as f: + print("loading data from %s as issued on %d" % (filename, issue)) + with open(filename, encoding="utf-8") as f: c = f.read() rows = [] - for l in csv.reader(StringIO(c), delimiter=','): + for l in csv.reader(StringIO(c), delimiter=","): rows.append(get_paho_row(l)) - print(' loaded %d rows' % len(rows)) + print(" loaded %d rows" % len(rows)) entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(" found %d entries" % len(entries)) - sql = ''' + sql = """ INSERT INTO `paho_dengue` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `total_pop`, `serotype`, `num_dengue`, `incidence_rate`, @@ -187,55 +200,56 @@ def update_from_file(issue, date, filename, test_mode=False): `incidence_rate` = %s, `num_severe` = %s, `num_deaths` = %s - ''' + """ for row in entries: - if row['issue'] > issue: # Issued in a week that hasn't happened yet + if row["issue"] > issue: # Issued in a week that hasn't happened yet continue - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['total_pop'], row['serotype'], row['num_dengue'], - row['incidence_rate'], row['num_severe'], row['num_deaths']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [ + row["total_pop"], + row["serotype"], + row["num_dengue"], + row["incidence_rate"], + row["num_severe"], + row["num_deaths"], + ] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args insert.execute(sql % tuple(insert_args + update_args)) # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", action="store_true", help="do dry run only, do not update the database" ) parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' + "--file", type=str, help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 201740); used iff --file is given' + "--issue", type=int, help="issue of the file (e.g. 
201740); used iff --file is given" ) args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) if args.file: update_from_file(args.issue, date, args.file, test_mode=args.test) @@ -274,7 +288,8 @@ def main(): if not db_error: break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `paho_dengue` did not update successfully') + print("WARNING: Database `paho_dengue` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index 60dd13ae8..5308ec93f 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -1,4 +1,3 @@ - # IMPORTANT: This code is extremely unstable. # Slight changes to the PAHO website may render this script partially or entirely useless. @@ -15,42 +14,51 @@ headerheight = 0 + def wait_for(browser, css_selector, delay=10): try: - WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) - WebDriverWait(browser, delay).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))) - print('Success Loading %s' % (css_selector)) + WebDriverWait(browser, delay).until( + EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) + ) + WebDriverWait(browser, delay).until( + EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)) + ) + print("Success Loading %s" % (css_selector)) except TimeoutException: print("Loading %s took too much time!" 
% (css_selector)) - + + def find_and_click(browser, element): element.location_once_scrolled_into_view browser.switch_to.default_content() - browser.execute_script("window.scrollBy(0,-%d)"%headerheight) + browser.execute_script("window.scrollBy(0,-%d)" % headerheight) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) element.click() -def get_paho_data(offset=0, dir='downloads'): + +def get_paho_data(offset=0, dir="downloads"): opts = Options() opts.set_headless() assert opts.headless # Operating in headless mode fp = FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") - - browser = Firefox(options=opts,firefox_profile=fp) - browser.get('http://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1') + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") + + browser = Firefox(options=opts, firefox_profile=fp) + browser.get( + "https://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1" + ) tab1 = browser.window_handles[0] - browser.execute_script('''window.open("","_blank");''') + browser.execute_script("""window.open("","_blank");""") tab2 = browser.window_handles[1] browser.switch_to.window(tab1) - + curr_offset = offset - + wait_for(browser, "div.rt-top-inner", delay=30) header = browser.find_element_by_css_selector("div.rt-top-inner") global headerheight @@ -59,41 +67,51 @@ def get_paho_data(offset=0, dir='downloads'): # The actual content of the data of this webpage is within 2 iframes, so we need to navigate into them first browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) - + # Locate the button that allows to download the table - downloadoption = browser.find_elements_by_css_selector("div.tabToolbarButton.tab-widget.download")[0] + downloadoption = browser.find_elements_by_css_selector( + "div.tabToolbarButton.tab-widget.download" + )[0] find_and_click(browser, downloadoption) wait_for(browser, "div[data-tb-test-id='DownloadImage-Button']") # Locate the button that prepares the table for download as an image - imagebutton = browser.find_elements_by_css_selector("div[data-tb-test-id='DownloadImage-Button']")[0] + imagebutton = browser.find_elements_by_css_selector( + "div[data-tb-test-id='DownloadImage-Button']" + )[0] find_and_click(browser, imagebutton) wait_for(browser, ".tabDownloadFileButton[data-test-id='DownloadLink']") # Locate the button that downloads the table as an image - downloadbutton = browser.find_elements_by_css_selector(".tabDownloadFileButton[data-test-id='DownloadLink']")[0] + downloadbutton = browser.find_elements_by_css_selector( + ".tabDownloadFileButton[data-test-id='DownloadLink']" + )[0] # Extract session ID href = downloadbutton.get_attribute("href") startidx = href.index("sessions/") + len("sessions/") - endidx = href.index("/",startidx) + endidx = href.index("/", startidx) sessionid = href[startidx:endidx] - 
dataurl = "http://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D"%sessionid + dataurl = f"https://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/{sessionid}/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D" wait_for(browser, "div[data-tb-test-id='CancelBtn-Button']") # Cancel image download - cancelbutton = browser.find_elements_by_css_selector("div[data-tb-test-id='CancelBtn-Button']")[0] + cancelbutton = browser.find_elements_by_css_selector("div[data-tb-test-id='CancelBtn-Button']")[ + 0 + ] find_and_click(browser, cancelbutton) wait_for(browser, "div[id='tableau_base_widget_FilterPanel_0']") # Default is to show data for current year, we want to get all years # Clicks drop-down menu to open options - yearselector = browser.find_elements_by_css_selector("div[id='tableau_base_widget_FilterPanel_0']")[0] + yearselector = browser.find_elements_by_css_selector( + "div[id='tableau_base_widget_FilterPanel_0']" + )[0] find_and_click(browser, yearselector) wait_for(browser, "div.facetOverflow") @@ -107,27 +125,29 @@ def get_paho_data(offset=0, dir='downloads'): for i in range(offset): gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() try: WebDriverWait(browser, 10).until(EC.staleness_of(gp)) - print("Loaded next week % d" % (53-offset)) + print("Loaded next week % d" % (53 - offset)) except TimeoutException: - print("Loading next week %d took too much time!" % (53-offset)) + print("Loading next week %d took too much time!" 
% (53 - offset)) gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() - x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() + x = browser.find_elements_by_css_selector( + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec" + )[0] find_and_click(browser, x) # Cycle through all weeks, downloading each week as a separate .csv # Theoretically, need to cycle 53 times, but in practice only 54 works, unsure why - for i in range(54-offset): + for i in range(54 - offset): # If something goes wrong for whatever reason, try from the beginning try: - print('Loading week %d' % (53-i)) + print("Loading week %d" % (53 - i)) # (Re-)load URL browser.switch_to.window(tab2) browser.get(dataurl) @@ -137,7 +157,9 @@ def get_paho_data(offset=0, dir='downloads'): full_data_tab = browser.find_elements_by_css_selector("li[id='tab-view-full-data']")[0] full_data_tab.click() - wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why + wait_for( + browser, "a.csvLink" + ) # Sometimes this fails but the button is successfully clicked anyway, not sure why # Actually download the data as a .csv (Will be downloaded to Firefox's default download destination) data_links = browser.find_elements_by_css_selector("a.csvLink") data_link = None @@ -149,16 +171,22 @@ def get_paho_data(offset=0, dir='downloads'): # Locate button that decreases the current week by 1 browser.switch_to.window(tab1) - wait_for(browser, "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec") - - x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] + wait_for( + browser, + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec", + ) + + x = browser.find_elements_by_css_selector( + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec" + )[0] find_and_click(browser, x) curr_offset += 1 except Exception as e: - print('Got exception %s\nTrying again from week %d' % (e,53-offset)) + print("Got exception %s\nTrying again from week %d" % (e, 53 - offset)) browser.quit() get_paho_data(offset=curr_offset) browser.quit() -if __name__ == '__main__': - get_paho_data(dir='downloads/') + +if __name__ == "__main__": + get_paho_data(dir="downloads/") From b9ceb400d9248c8271e8342275664ac5524e335d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:08:31 -0700 Subject: [PATCH 27/43] style(black): format quidel acquisition --- src/acquisition/quidel/quidel.py | 232 ++++++++++++++---------- src/acquisition/quidel/quidel_update.py | 202 +++++++++++---------- 2 files changed, 245 insertions(+), 189 deletions(-) diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index a7c9a2918..3af99774f 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -15,7 +15,7 @@ * add end date, end week check 2017-12-02: * original version -''' +""" # standard library from collections import defaultdict @@ -35,148 +35,187 @@ import delphi.utils.epidate as ED from delphi.utils.geo.locations import Locations -def 
word_map(row,terms): - for (k,v) in terms.items(): - row = row.replace(k,v) + +def word_map(row, terms): + for (k, v) in terms.items(): + row = row.replace(k, v) return row -def date_less_than(d1,d2): - y1,m1,d1 = [int(x) for x in d1.split('-')] - y2,m2,d2 = [int(x) for x in d2.split('-')] - if y1*10000+m1*100+d10: shifted to future def date_to_epiweek(date, shift=0): - y,m,d = [int(x) for x in date.split('-')] + y, m, d = (int(x) for x in date.split("-")) - epidate = ED.EpiDate(y,m,d) + epidate = ED.EpiDate(y, m, d) epidate = epidate.add_days(shift) ew = epidate.get_ew() return ew + # convert measurment to time series format # startweek and endweek are inclusive -def measurement_to_ts(m,index,startweek=None,endweek=None): +def measurement_to_ts(m, index, startweek=None, endweek=None): if startweek is None: startweek = 0 if endweek is None: endweek = 999999 res = {} - for r,rdict in m.items(): - res[r]={} - for t,vals in rdict.items(): - if index>=len(vals): + for r, rdict in m.items(): + res[r] = {} + for t, vals in rdict.items(): + if index >= len(vals): raise Exception("Index is invalid") - if t>=startweek and t<=endweek: + if t >= startweek and t <= endweek: res[r][t] = vals[index] return res + class QuidelData: def __init__(self, raw_path, load_email=True): self.data_path = raw_path - self.excel_uptodate_path = join(raw_path,'excel/uptodate') - self.excel_history_path = join(raw_path,'excel/history') - self.csv_path = join(raw_path,'csv') + self.excel_uptodate_path = join(raw_path, "excel/uptodate") + self.excel_history_path = join(raw_path, "excel/history") + self.csv_path = join(raw_path, "csv") self.xlsx_uptodate_list = [ - f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:]=='.xlsx' + f[:-5] + for f in listdir(self.excel_uptodate_path) + if isfile(join(self.excel_uptodate_path, f)) and f[-5:] == ".xlsx" ] self.xlsx_history_list = [ - f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:]=='.xlsx' + f[:-5] + for f in listdir(self.excel_history_path) + if isfile(join(self.excel_history_path, f)) and f[-5:] == ".xlsx" + ] + self.csv_list = [ + f[:-4] + for f in listdir(self.csv_path) + if isfile(join(self.csv_path, f)) and f[-4:] == ".csv" ] - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] self.map_terms = { - ' FL 34637"':'FL', + ' FL 34637"': "FL", } # hardcoded parameters self.date_dim = 1 self.state_dim = 4 self.fields = [ - 'sofia_ser','date','fac_id','city','state','zip','age', - 'fluA','fluB','fluAll','county','fac_type' + "sofia_ser", + "date", + "fac_id", + "city", + "state", + "zip", + "age", + "fluA", + "fluB", + "fluAll", + "county", + "fac_type", ] - self.fields_to_keep = ['fac_id','fluA','fluB','fluAll'] + self.fields_to_keep = ["fac_id", "fluA", "fluB", "fluAll"] self.dims_to_keep = [self.fields.index(x) for x in self.fields_to_keep] if load_email: self.retrieve_excels() self.prepare_csv() def retrieve_excels(self): - detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) + detach_dir = ( + self.excel_uptodate_path + ) # directory where to save attachments (default: current) # connecting to the gmail imap server m = imaplib.IMAP4_SSL("imap.gmail.com") - m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd) - m.select("INBOX") # here you a can choose a mail box like INBOX instead + m.login(secrets.quidel.email_addr, secrets.quidel.email_pwd) + m.select("INBOX") 
# here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes - _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) - items = items[0].split() # getting the mails id + # you could filter using the IMAP rules here (check https://www.example-code.com/csharp/imap-search-critera.asp) + _, items = m.search(None, "ALL") + items = items[0].split() # getting the mails id # The emailids are ordered from past to now for emailid in items: - _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc - email_body = data[0][1].decode('utf-8') # getting the mail content - mail = email.message_from_string(email_body) # parsing the mail content to get a mail object - - #Check if any attachments at all - if mail.get_content_maintype() != 'multipart': + _, data = m.fetch( + emailid, "(RFC822)" + ) # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc + email_body = data[0][1].decode("utf-8") # getting the mail content + mail = email.message_from_string( + email_body + ) # parsing the mail content to get a mail object + + # Check if any attachments at all + if mail.get_content_maintype() != "multipart": continue # we use walk to create a generator so we can iterate on the parts and forget about the recursive headach for part in mail.walk(): # multipart are just containers, so we skip them - if part.get_content_maintype() == 'multipart': + if part.get_content_maintype() == "multipart": continue # is this part an attachment ? - if part.get('Content-Disposition') is None: + if part.get("Content-Disposition") is None: continue filename = part.get_filename() # check duplicates - if filename[-5:]!='.xlsx' or filename[:-5] in self.xlsx_uptodate_list+self.xlsx_history_list: + if ( + filename[-5:] != ".xlsx" + or filename[:-5] in self.xlsx_uptodate_list + self.xlsx_history_list + ): continue self.xlsx_uptodate_list.append(filename[:-5]) att_path = os.path.join(detach_dir, filename) - #Check if its already there - if not os.path.isfile(att_path) : + # Check if its already there + if not os.path.isfile(att_path): # finally write the stuff - fp = open(att_path, 'wb') + fp = open(att_path, "wb") fp.write(part.get_payload(decode=True)) fp.close() def prepare_csv(self): - need_update=False + need_update = False for f in self.xlsx_uptodate_list: if f in self.csv_list: continue else: - need_update=True + need_update = True - date_regex = '\d{2}-\d{2}-\d{4}' - date_items = re.findall(date_regex,f) + date_regex = r"\d{2}-\d{2}-\d{4}" + date_items = re.findall(date_regex, f) if date_items: - end_date = '-'.join(date_items[-1].split('-')[x] for x in [2,0,1]) + end_date = "-".join(date_items[-1].split("-")[x] for x in [2, 0, 1]) else: - print("End date not found in file name:"+f) + print("End date not found in file name:" + f) end_date = None - df_dict = pd.read_excel(join(self.excel_uptodate_path, f+'.xlsx'), sheet_name=None) - for (_,df) in df_dict.items(): - df = df.dropna(axis=0, how='all') - df['TestDate'] = df['TestDate'].apply(lambda x: x.strftime('%Y-%m-%d')) - df_filtered = df[df['TestDate']!=''] + df_dict = pd.read_excel(join(self.excel_uptodate_path, f + ".xlsx"), sheet_name=None) + for (_, df) in df_dict.items(): + df = df.dropna(axis=0, how="all") + df["TestDate"] = df["TestDate"].apply(lambda x: x.strftime("%Y-%m-%d")) + df_filtered = df[df["TestDate"] != ""] if 
end_date is not None: - df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date,x['TestDate'])!=1, axis=1)] - df_filtered.to_csv(join(self.csv_path, f+'.csv'), index=False, encoding='utf-8') - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] + df_filtered = df_filtered[ + df.apply(lambda x: date_less_than(end_date, x["TestDate"]) != 1, axis=1) + ] + df_filtered.to_csv(join(self.csv_path, f + ".csv"), index=False, encoding="utf-8") + self.csv_list = [ + f[:-4] + for f in listdir(self.csv_path) + if isfile(join(self.csv_path, f)) and f[-4:] == ".csv" + ] self.need_update = need_update def load_csv(self, dims=None): @@ -186,12 +225,12 @@ def load_csv(self, dims=None): for f in self.csv_list: if f in self.xlsx_history_list: continue - rf = open(join(self.csv_path,f+'.csv')) + rf = open(join(self.csv_path, f + ".csv")) lines = rf.readlines() for l in lines[1:]: - l = word_map(l,self.map_terms) - row = l.strip().split(',') + l = word_map(l, self.map_terms) + row = l.strip().split(",") date = row[self.date_dim] state = row[self.state_dim] if state not in parsed_dict[date]: @@ -202,7 +241,7 @@ def load_csv(self, dims=None): # hardcoded aggregation function # output: [#unique_device,fluA,fluB,fluAll,total] - def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): + def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6): buffer_dict = {} if use_hhs: region_list = Locations.hhs_list @@ -210,34 +249,35 @@ def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): region_list = Locations.atom_list def get_hhs_region(atom): - for region in Locations.hhs_list: - if atom.lower() in Locations.hhs_map[region]: - return region - if atom.lower() == 'ny': - return 'hhs2' - return atom + for region in Locations.hhs_list: + if atom.lower() in Locations.hhs_map[region]: + return region + if atom.lower() == "ny": + return "hhs2" + return atom day_shift = 6 - start_weekday - time_map = lambda x:date_to_epiweek(x,shift=day_shift) - region_map = lambda x:get_hhs_region(x) \ - if use_hhs and x not in Locations.hhs_list else x # a bit hacky + time_map = lambda x: date_to_epiweek(x, shift=day_shift) + region_map = ( + lambda x: get_hhs_region(x) if use_hhs and x not in Locations.hhs_list else x + ) # a bit hacky end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included - end_epiweek = date_to_epiweek(end_date,shift=-4) + end_epiweek = date_to_epiweek(end_date, shift=-4) # first pass: prepare device_id set device_dict = {} - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): if not date: continue ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in device_dict: - device_dict[ew]={} + device_dict[ew] = {} for r in region_list: device_dict[ew][r] = set() - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: @@ -247,38 +287,40 @@ def get_hhs_region(atom): device_dict[ew][region].add(fac) # second pass: prepare all measurements - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in buffer_dict: - buffer_dict[ew]={} + buffer_dict[ew] = {} for r in region_list: - buffer_dict[ew][r] = [0.0]*8 + 
buffer_dict[ew][r] = [0.0] * 8 - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: continue for rec in rec_list: fac_num = float(len(device_dict[ew][region])) - buffer_dict[ew][region]= np.add( - buffer_dict[ew][region],[ - rec[1]=='positive', - rec[2]=='positive', - rec[3]=='positive', + buffer_dict[ew][region] = np.add( + buffer_dict[ew][region], + [ + rec[1] == "positive", + rec[2] == "positive", + rec[3] == "positive", 1.0, - float(rec[1]=='positive')/fac_num, - float(rec[2]=='positive')/fac_num, - float(rec[3]=='positive')/fac_num, - 1.0/fac_num, - ]).tolist() + float(rec[1] == "positive") / fac_num, + float(rec[2] == "positive") / fac_num, + float(rec[3] == "positive") / fac_num, + 1.0 / fac_num, + ], + ).tolist() # switch two dims of dict result_dict = {} for r in region_list: - result_dict[r]={} - for (k,v) in buffer_dict.items(): - result_dict[r][k]=v[r] + result_dict[r] = {} + for (k, v) in buffer_dict.items(): + result_dict[r][k] = v[r] return result_dict diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index b6303533c..286a30834 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -33,7 +33,7 @@ 2017-12-02: * original version -''' +""" # standard library import argparse @@ -49,106 +49,120 @@ from delphi.utils.geo.locations import Locations LOCATIONS = Locations.hhs_list -DATAPATH = '/home/automation/quidel_data' +DATAPATH = "/home/automation/quidel_data" + def update(locations, first=None, last=None, force_update=False, load_email=True): - # download and prepare data first - qd = quidel.QuidelData(DATAPATH,load_email) - if not qd.need_update and not force_update: - print('Data not updated, nothing needs change.') - return - - qd_data = qd.load_csv() - qd_measurements = qd.prepare_measurements(qd_data,start_weekday=4) - qd_ts = quidel.measurement_to_ts(qd_measurements,7,startweek=first,endweek=last) - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `quidel`') - for (num,) in cur: - pass - return num - - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check Quidel for new and/or revised data - sql = ''' + # download and prepare data first + qd = quidel.QuidelData(DATAPATH, load_email) + if not qd.need_update and not force_update: + print("Data not updated, nothing needs change.") + return + + qd_data = qd.load_csv() + qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4) + qd_ts = quidel.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last) + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `quidel`") + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print("Checking epiweeks between %d and %d..." % (ew0, ew1)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check Quidel for new and/or revised data + sql = """ INSERT INTO `quidel` (`location`, `epiweek`, `value`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - - total_rows = 0 - - for location in locations: - if location not in qd_ts: - continue - ews = sorted(qd_ts[location].keys()) - num_missing = 0 - for ew in ews: - v = qd_ts[location][ew] - sql_data = (location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - if num_missing > 0: - print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews))) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + + total_rows = 0 + + for location in locations: + if location not in qd_ts: + continue + ews = sorted(qd_ts[location].keys()) + num_missing = 0 + for ew in ews: + v = qd_ts[location][ew] + sql_data = (location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + if num_missing > 0: + print(" [%s] missing %d/%d value(s)" % (location, num_missing, len(ews))) + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--location', action='store', type=str, default=None, help='location(s) (ex: all; any of hhs1-10)') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--force_update', '-u', action='store_true', help='force update db values') - parser.add_argument('--skip_email', '-s', action='store_true', help='skip email downloading step') - args = parser.parse_args() - - # sanity check - first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email - load_email = not skip_email - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > 
last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - else: - locations = args.location.lower().split(',') - - # run the update - update(locations, first, last, force_update, load_email) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "--location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; any of hhs1-10)", + ) + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--force_update", "-u", action="store_true", help="force update db values") + parser.add_argument( + "--skip_email", "-s", action="store_true", help="skip email downloading step" + ) + args = parser.parse_args() + + # sanity check + first, last, force_update, skip_email = ( + args.first, + args.last, + args.force_update, + args.skip_email, + ) + load_email = not skip_email + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + else: + locations = args.location.lower().split(",") + + # run the update + update(locations, first, last, force_update, load_email) + + +if __name__ == "__main__": + main() From 07ed83e5768f717ab0f9a62a9209e4e2cffa058d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:08:59 -0700 Subject: [PATCH 28/43] style(black): format twitter acquisition --- src/acquisition/twtr/healthtweets.py | 342 ++++++++++++++++--------- src/acquisition/twtr/pageparser.py | 121 +++++---- src/acquisition/twtr/twitter_update.py | 99 +++---- 3 files changed, 329 insertions(+), 233 deletions(-) diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 78eb2b3ec..31976f376 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -20,7 +20,7 @@ * Fetching daily values instead of weekly values 2015-03-?? 
* Original version -''' +""" # standard library import argparse @@ -36,132 +36,220 @@ class HealthTweets: - # mapping from state abbreviations to location codes used by healthtweets.org - STATE_CODES = {'AL': 3024, 'AK': 3025, 'AZ': 3026, 'AR': 3027, 'CA': 440, 'CO': 3029, 'CT': 3030, 'DE': 3031, 'DC': 3032, 'FL': 3033, 'GA': 3034, 'HI': 3035, 'ID': 3036, 'IL': 3037, 'IN': 3038, 'IA': 3039, 'KS': 3040, 'KY': 3041, 'LA': 2183, 'ME': 3043, 'MD': 3044, 'MA': 450, 'MI': 3046, 'MN': 3047, 'MS': 3048, 'MO': 3049, 'MT': 3050, 'NE': 3051, 'NV': 3052, 'NH': 3053, 'NJ': 478, 'NM': 2225, 'NY': 631, 'NC': 3057, 'ND': 3058, 'OH': 3059, 'OK': 3060, 'OR': 281, 'PA': 3062, 'RI': 3063, 'SC': 3064, 'SD': 3065, 'TN': 3066, 'TX': 3067, 'UT': 2272, 'VT': 3069, 'VA': 3070, 'WA': 3071, 'WV': 3072, 'WI': 3073, 'WY': 3074} - - def __init__(self, username, password, debug=False): - self.debug = debug - self.session = requests.Session() - # spoof a web browser - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', - }) - # get the login token - response = self._go('http://www.healthtweets.org/accounts/login') - token = self._get_token(response.text) - if self.debug: - print('token=%s'%(token)) - data = { - 'csrfmiddlewaretoken': token, - 'username': username, - 'password': password, - 'next': '/', + # mapping from state abbreviations to location codes used by healthtweets.org + STATE_CODES = { + "AL": 3024, + "AK": 3025, + "AZ": 3026, + "AR": 3027, + "CA": 440, + "CO": 3029, + "CT": 3030, + "DE": 3031, + "DC": 3032, + "FL": 3033, + "GA": 3034, + "HI": 3035, + "ID": 3036, + "IL": 3037, + "IN": 3038, + "IA": 3039, + "KS": 3040, + "KY": 3041, + "LA": 2183, + "ME": 3043, + "MD": 3044, + "MA": 450, + "MI": 3046, + "MN": 3047, + "MS": 3048, + "MO": 3049, + "MT": 3050, + "NE": 3051, + "NV": 3052, + "NH": 3053, + "NJ": 478, + "NM": 2225, + "NY": 631, + "NC": 3057, + "ND": 3058, + "OH": 3059, + "OK": 3060, + "OR": 281, + "PA": 3062, + "RI": 3063, + "SC": 3064, + "SD": 3065, + "TN": 3066, + "TX": 3067, + "UT": 2272, + "VT": 3069, + "VA": 3070, + "WA": 3071, + "WV": 3072, + "WI": 3073, + "WY": 3074, } - # login to the site - response = self._go('http://www.healthtweets.org/accounts/login', data=data) - if response.status_code != 200 or 'Your username and password' in response.text: - raise Exception('login failed') - - def get_values(self, state, date1, date2): - ''' - state: two-letter state abbreviation (see STATE_CODES) - date1: the first date in the range, inclusive (format: YYYY-MM-DD) - date2: the last date in the range, inclusive (format: YYYY-MM-DD) - returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) - ''' - # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) - raw_values = self._get_values(state, date1, date2, False) - normalized_values = self._get_values(state, date1, date2, True) - values = {} - # save the raw number and calculate the total - for date in raw_values.keys(): - if normalized_values[date] == 0: - continue - values[date] = { - 'num': round(raw_values[date]), - 'total': round(100 * raw_values[date] / normalized_values[date]), - } - print(date, raw_values[date], normalized_values[date]) - return values - - def _get_values(self, state, date1, date2, normalized): - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - d1, d2 = datetime.strptime(date1, 
'%Y-%m-%d'), datetime.strptime(date2, '%Y-%m-%d') - s1, s2 = d1.strftime('%m%%2F%d%%2F%Y'), d2.strftime('%m%%2F%d%%2F%Y') - count_type = 'normalized' if normalized else 'raw' - url = 'http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code) - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code)) - #print(state, date1, date2, normalized) - #print(url) - #print(response.status_code) - if response.status_code != 200: - raise Exception('plot status is ' + str(response.status_code) + ' (when was data last updated?)') - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:16] == 'var chartData = '] - if len(data_line) != 1: - raise Exception('lookup failed') - values = json.loads(data_line[0][16:-1]) - return dict([(datetime.strptime(v[0], '%m/%d/%Y').strftime('%Y-%m-%d'), float(v[1])) for v in values]) - - def check_state(self, state): - ''' - Sanity checks state code mapping. - state: two-letter state abbreviation (see STATE_CODES) - returns the full state name associated with the state abbreviation - ''' - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d'%(state_code)) - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] - if len(data_line) == 0: - raise Exception('check failed') - name = data_line[0][29:] - name = name.split('(')[0] - return name.strip() - - def _get_token(self, html): - page = PageParser.parse(html) - hidden = PageParser.filter_all(page, [('html',), ('body',), ('div',), ('div',), ('div',), ('form',), ('input',)]) - return hidden['attrs']['value'] - - def _go(self, url, method=None, referer=None, data=None): - if self.debug: - print('%s'%(url)) - if method is None: - if data is None: - method = self.session.get - else: - method = self.session.post - response = method(url, headers={'referer': referer}, data=data) - html = response.text - if self.debug: - for item in response.history: - print(' [%d to %s]'%(item.status_code, item.headers['Location'])) - print(' %d (%d bytes)'%(response.status_code, len(html))) - return response + + def __init__(self, username, password, debug=False): + self.debug = debug + self.session = requests.Session() + # spoof a web browser + self.session.headers.update( + { + "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", + } + ) + # get the login token + response = self._go("https://www.healthtweets.org/accounts/login") + token = self._get_token(response.text) + if self.debug: + print("token=%s" % (token)) + data = { + "csrfmiddlewaretoken": token, + "username": username, + "password": password, + "next": "/", + } + # login to the site + response = self._go("https://www.healthtweets.org/accounts/login", data=data) + if response.status_code != 200 or "Your username and password" in response.text: + raise Exception("login failed") + + def get_values(self, state, date1, date2): + """ + state: two-letter state 
abbreviation (see STATE_CODES) + date1: the first date in the range, inclusive (format: YYYY-MM-DD) + date2: the last date in the range, inclusive (format: YYYY-MM-DD) + returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) + """ + # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) + raw_values = self._get_values(state, date1, date2, False) + normalized_values = self._get_values(state, date1, date2, True) + values = {} + # save the raw number and calculate the total + for date in raw_values.keys(): + if normalized_values[date] == 0: + continue + values[date] = { + "num": round(raw_values[date]), + "total": round(100 * raw_values[date] / normalized_values[date]), + } + print(date, raw_values[date], normalized_values[date]) + return values + + def _get_values(self, state, date1, date2, normalized): + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d") + s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y") + count_type = "normalized" if normalized else "raw" + url = ( + "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" + % (count_type, (d2 - d1).days, s1, s2, state_code) + ) + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" + % (count_type, (d2 - d1).days, s1, s2, state_code) + ) + # print(state, date1, date2, normalized) + # print(url) + # print(response.status_code) + if response.status_code != 200: + raise Exception( + "plot status is " + str(response.status_code) + " (when was data last updated?)" + ) + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:16] == "var chartData = "] + if len(data_line) != 1: + raise Exception("lookup failed") + values = json.loads(data_line[0][16:-1]) + return { + datetime.strptime(v[0], "%m/%d/%Y").strftime("%Y-%m-%d"): float(v[1]) for v in values + } + + def check_state(self, state): + """ + Sanity checks state code mapping. 
+ state: two-letter state abbreviation (see STATE_CODES) + returns the full state name associated with the state abbreviation + """ + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d" % (state_code) + ) + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] + if len(data_line) == 0: + raise Exception("check failed") + name = data_line[0][29:] + name = name.split("(")[0] + return name.strip() + + def _get_token(self, html): + page = PageParser.parse(html) + hidden = PageParser.filter_all( + page, [("html",), ("body",), ("div",), ("div",), ("div",), ("form",), ("input",)] + ) + return hidden["attrs"]["value"] + + def _go(self, url, method=None, referer=None, data=None): + if self.debug: + print("%s" % (url)) + if method is None: + if data is None: + method = self.session.get + else: + method = self.session.post + response = method(url, headers={"referer": referer}, data=data) + html = response.text + if self.debug: + for item in response.history: + print(" [%d to %s]" % (item.status_code, item.headers["Location"])) + print(" %d (%d bytes)" % (response.status_code, len(html))) + return response def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('username', action='store', type=str, help='healthtweets.org username') - parser.add_argument('password', action='store', type=str, help='healthtweets.org password') - parser.add_argument('state', action='store', type=str, choices=list(HealthTweets.STATE_CODES.keys()), help='U.S. state (ex: TX)') - parser.add_argument('date1', action='store', type=str, help='first date, inclusive (ex: 2015-01-01)') - parser.add_argument('date2', action='store', type=str, help='last date, inclusive (ex: 2015-01-01)') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - ht = HealthTweets(args.username, args.password, debug=args.debug) - values = ht.get_values(args.state, args.date1, args.date2) - print('Daily counts in %s from %s to %s:'%(ht.check_state(args.state), args.date1, args.date2)) - for date in sorted(list(values.keys())): - print('%s: num=%-4d total=%-5d (%.3f%%)'%(date, values[date]['num'], values[date]['total'], 100 * values[date]['num'] / values[date]['total'])) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("username", action="store", type=str, help="healthtweets.org username") + parser.add_argument("password", action="store", type=str, help="healthtweets.org password") + parser.add_argument( + "state", + action="store", + type=str, + choices=list(HealthTweets.STATE_CODES.keys()), + help="U.S. 
state (ex: TX)", + ) + parser.add_argument( + "date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode" + ) + args = parser.parse_args() + + ht = HealthTweets(args.username, args.password, debug=args.debug) + values = ht.get_values(args.state, args.date1, args.date2) + print(f"Daily counts in {ht.check_state(args.state)} from {args.date1} to {args.date2}:") + for date in sorted(list(values.keys())): + print( + "%s: num=%-4d total=%-5d (%.3f%%)" + % ( + date, + values[date]["num"], + values[date]["total"], + 100 * values[date]["num"] / values[date]["total"], + ) + ) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/twtr/pageparser.py b/src/acquisition/twtr/pageparser.py index 5e9aaaea1..2b2183c89 100644 --- a/src/acquisition/twtr/pageparser.py +++ b/src/acquisition/twtr/pageparser.py @@ -5,74 +5,73 @@ class PageParser(HTMLParser): - ''' - This is an HTML parser! All of the hard work is done by the superclass - (which is a Python built-in). This class puts the HTML into a hierarchy - that's (hopefully) easier to work with than raw string parsing. - ''' + """ + This is an HTML parser! All of the hard work is done by the superclass + (which is a Python built-in). This class puts the HTML into a hierarchy + that's (hopefully) easier to work with than raw string parsing. + """ - @staticmethod - def parse(html): - parser = PageParser() - parser.feed(html) - return parser.get_root_node() + @staticmethod + def parse(html): + parser = PageParser() + parser.feed(html) + return parser.get_root_node() - @staticmethod - def banlist(): - '''Commonly unclosed tags''' - return ('br', 'img', 'meta') + @staticmethod + def banlist(): + """Commonly unclosed tags""" + return ("br", "img", "meta") - @staticmethod - def new_node(type): - '''An empty node of the HTML tree''' - return {'type': type, 'attrs': {}, 'nodes': [], 'data': ''} + @staticmethod + def new_node(type): + """An empty node of the HTML tree""" + return {"type": type, "attrs": {}, "nodes": [], "data": ""} - @staticmethod - def filter_all(node, filters): - '''Applies all filters''' - for f in filters: - node = PageParser.filter(node, *f) - return node + @staticmethod + def filter_all(node, filters): + """Applies all filters""" + for f in filters: + node = PageParser.filter(node, *f) + return node - @staticmethod - def filter(node, type, index=0): - '''Finds a sub-node of the given type, specified by index''' - i = 0 - for node in node['nodes']: - if node['type'] == type: - if i == index: - return node - i += 1 - return None + @staticmethod + def filter(node, type, index=0): + """Finds a sub-node of the given type, specified by index""" + i = 0 + for node in node["nodes"]: + if node["type"] == type: + if i == index: + return node + i += 1 + return None - def __init__(self): - HTMLParser.__init__(self) - self.root = PageParser.new_node(None) - self.stack = [self.root] - self.indent = 0 + def __init__(self): + HTMLParser.__init__(self) + self.root = PageParser.new_node(None) + self.stack = [self.root] + self.indent = 0 - def get_root_node(self): - '''After parsing, returns the abstract root node (which contains the html node)''' - return self.root + def get_root_node(self): + """After parsing, returns the abstract root node (which contains the html node)""" + return self.root - def 
handle_starttag(self, tag, attrs): - '''Inherited - called when a start tag is found''' - if tag in PageParser.banlist(): - return - element = PageParser.new_node(tag) - for (k, v) in attrs: - element['attrs'][k] = v - self.stack[-1]['nodes'].append(element) - self.stack.append(element) + def handle_starttag(self, tag, attrs): + """Inherited - called when a start tag is found""" + if tag in PageParser.banlist(): + return + element = PageParser.new_node(tag) + for (k, v) in attrs: + element["attrs"][k] = v + self.stack[-1]["nodes"].append(element) + self.stack.append(element) - def handle_endtag(self, tag): - '''Inherited - called when an end tag is found''' - if tag in PageParser.banlist(): - return - self.stack.pop() + def handle_endtag(self, tag): + """Inherited - called when an end tag is found""" + if tag in PageParser.banlist(): + return + self.stack.pop() - - def handle_data(self, data): - '''Inherited - called when a data string is found''' - element = self.stack[-1] - element['data'] += data + def handle_data(self, data): + """Inherited - called when a data string is found""" + element = self.stack[-1] + element["data"] += data diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 5c1f3f45b..4354c5a80 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -49,7 +49,7 @@ * Small documentation update 2015-05-22 * Original version -''' +""" # third party import mysql.connector @@ -60,46 +60,55 @@ def run(): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `twitter`') - for (num,) in cur: - pass - return num - - # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if today's date is part of the range) - cur.execute('SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`') - for (date1, date2) in cur: - date1, date2 = date1.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d') - print('Checking dates between %s and %s...'%(date1, date2)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check healthtweets.org for new and/or revised data - ht = HealthTweets(*secrets.healthtweets.login) - sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' - total_rows = 0 - for state in sorted(HealthTweets.STATE_CODES.keys()): - values = ht.get_values(state, date1, date2) - for date in sorted(list(values.keys())): - sql_data = (date, state, values[date]['num'], values[date]['total'], values[date]['num'], values[date]['total']) - cur.execute(sql, sql_data) - total_rows += 1 - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `twitter`") + for (num,) in cur: + pass + return num + + # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if 
today's date is part of the range) + cur.execute( + "SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`" + ) + for (date1, date2) in cur: + date1, date2 = date1.strftime("%Y-%m-%d"), date2.strftime("%Y-%m-%d") + print(f"Checking dates between {date1} and {date2}...") + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check healthtweets.org for new and/or revised data + ht = HealthTweets(*secrets.healthtweets.login) + sql = "INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s" + total_rows = 0 + for state in sorted(HealthTweets.STATE_CODES.keys()): + values = ht.get_values(state, date1, date2) + for date in sorted(list(values.keys())): + sql_data = ( + date, + state, + values[date]["num"], + values[date]["total"], + values[date]["num"], + values[date]["total"], + ) + cur.execute(sql, sql_data) + total_rows += 1 + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() From 923852eafa86b8f8b182d499489249ba8f815843 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:09:21 -0700 Subject: [PATCH 29/43] style(black): format wiki acquisition --- src/acquisition/wiki/wiki.py | 246 +++++++------- src/acquisition/wiki/wiki_download.py | 470 ++++++++++++++------------ src/acquisition/wiki/wiki_extract.py | 140 ++++---- src/acquisition/wiki/wiki_update.py | 149 ++++---- src/acquisition/wiki/wiki_util.py | 275 ++++++++------- 5 files changed, 674 insertions(+), 606 deletions(-) diff --git a/src/acquisition/wiki/wiki.py b/src/acquisition/wiki/wiki.py index 602e21102..c57582918 100644 --- a/src/acquisition/wiki/wiki.py +++ b/src/acquisition/wiki/wiki.py @@ -1,112 +1,112 @@ """ -=============== -=== Purpose === -=============== - -Wrapper for the entire wiki data collection process: - 1. Uses wiki_update.py to fetch metadata for new access logs - 2. Uses wiki_download.py to download the access logs - 3. Uses wiki_extract.py to store article access counts - +=============== +=== Purpose === +=============== + +Wrapper for the entire wiki data collection process: + 1. Uses wiki_update.py to fetch metadata for new access logs + 2. Uses wiki_download.py to download the access logs + 3. Uses wiki_extract.py to store article access counts + See also: master.php - - -======================= -=== Data Dictionary === -======================= - -`wiki_raw` is a staging table where extracted access log data is stored for -further processing. When wiki_update.py finds a new log, it saves the name and -hash to this table, with a status of 0. This table is read by master.php, which -then hands out "jobs" (independently and in parallel) to wiki_download.py. -After wiki_download.py downloads the log and extracts the counts, it submits -the data (as JSON) to master.php, which then stores the "raw" JSON counts in -this table. 
-+----------+---------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+---------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| name | varchar(64) | NO | UNI | NULL | | -| hash | char(32) | NO | | NULL | | -| status | int(11) | NO | MUL | 0 | | -| size | int(11) | YES | | NULL | | -| datetime | datetime | YES | | NULL | | -| worker | varchar(256) | YES | | NULL | | -| elapsed | float | YES | | NULL | | -| data | varchar(2048) | YES | | NULL | | -+----------+---------------+------+-----+---------+----------------+ -id: unique identifier for each record -name: name of the access log -hash: md5 hash of the file, as reported by the dumps site (all zeroes if no - hash is provided) -status: the status of the job, using the following values: - 0: queued for download - 1: download in progress - 2: queued for extraction - 3: extracted to `wiki` table - (any negative value indicates failure) -size: the size, in bytes, of the downloaded file -datetime: the timestamp of the most recent status update -worker: name (user@hostname) of the machine working on the job -elapsed: time, in seconds, taken to complete the job -data: a JSON string containing counts for selected articles in the access log - -`wiki` is the table where access counts are stored (parsed from wiki_raw). The -"raw" JSON counts are parsed by wiki_extract.py and stored directly in this -table. -+----------+-------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+-------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | MUL | NULL | | -| article | varchar(64) | NO | MUL | NULL | | -| count | int(11) | NO | | NULL | | -+----------+-------------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -article: name of the article -count: number of times the article was accessed in the hour - -`wiki_meta` is a metadata table for this dataset. It contains pre-calculated -date and epiweeks fields, and more importantly, the total number of English -article hits (denominator) for each `datetime` in the `wiki` table. This table -is populated in parallel with `wiki` by the wiki_extract.py script. -+----------+----------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+----------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | UNI | NULL | | -| date | date | NO | | NULL | | -| epiweek | int(11) | NO | | NULL | | -| total | int(11) | NO | | NULL | | -+----------+----------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -date: the date portion of `datetime` -epiweek: the year and week containing `datetime` -total: total number of English article hits in the hour - - -================= -=== Changelog === -================= - + + +======================= +=== Data Dictionary === +======================= + +`wiki_raw` is a staging table where extracted access log data is stored for +further processing. When wiki_update.py finds a new log, it saves the name and +hash to this table, with a status of 0. 
This table is read by master.php, which +then hands out "jobs" (independently and in parallel) to wiki_download.py. +After wiki_download.py downloads the log and extracts the counts, it submits +the data (as JSON) to master.php, which then stores the "raw" JSON counts in +this table. ++----------+---------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+---------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| name | varchar(64) | NO | UNI | NULL | | +| hash | char(32) | NO | | NULL | | +| status | int(11) | NO | MUL | 0 | | +| size | int(11) | YES | | NULL | | +| datetime | datetime | YES | | NULL | | +| worker | varchar(256) | YES | | NULL | | +| elapsed | float | YES | | NULL | | +| data | varchar(2048) | YES | | NULL | | ++----------+---------------+------+-----+---------+----------------+ +id: unique identifier for each record +name: name of the access log +hash: md5 hash of the file, as reported by the dumps site (all zeroes if no + hash is provided) +status: the status of the job, using the following values: + 0: queued for download + 1: download in progress + 2: queued for extraction + 3: extracted to `wiki` table + (any negative value indicates failure) +size: the size, in bytes, of the downloaded file +datetime: the timestamp of the most recent status update +worker: name (user@hostname) of the machine working on the job +elapsed: time, in seconds, taken to complete the job +data: a JSON string containing counts for selected articles in the access log + +`wiki` is the table where access counts are stored (parsed from wiki_raw). The +"raw" JSON counts are parsed by wiki_extract.py and stored directly in this +table. ++----------+-------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+-------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | MUL | NULL | | +| article | varchar(64) | NO | MUL | NULL | | +| count | int(11) | NO | | NULL | | ++----------+-------------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +article: name of the article +count: number of times the article was accessed in the hour + +`wiki_meta` is a metadata table for this dataset. It contains pre-calculated +date and epiweeks fields, and more importantly, the total number of English +article hits (denominator) for each `datetime` in the `wiki` table. This table +is populated in parallel with `wiki` by the wiki_extract.py script. 
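A minimal usage sketch of that denominator (assuming only the `wiki` and `wiki_meta`
schemas documented here; the credentials, the join query, and the example article name
are illustrative placeholders, not taken from this patch set):

    import mysql.connector

    # placeholder credentials for a local `epidata` database
    cnx = mysql.connector.connect(user="user", password="pass", database="epidata")
    cur = cnx.cursor()
    # hourly fraction of tracked pageviews going to one article: count / total
    cur.execute(
        "SELECT w.`datetime`, w.`count` / m.`total` "
        "FROM `wiki` w JOIN `wiki_meta` m ON m.`datetime` = w.`datetime` "
        "WHERE w.`article` = %s",
        ("influenza",),
    )
    for (dt, frac) in cur:
        print(dt, frac)
    cur.close()
    cnx.close()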
++----------+----------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+----------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | UNI | NULL | | +| date | date | NO | | NULL | | +| epiweek | int(11) | NO | | NULL | | +| total | int(11) | NO | | NULL | | ++----------+----------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +date: the date portion of `datetime` +epiweek: the year and week containing `datetime` +total: total number of English article hits in the hour + + +================= +=== Changelog === +================= + 2017-02-24 * secrets and small improvements 2016-08-14 * Increased job limit (6 -> 12) (pageviews files are ~2x smaller) -2015-08-26 +2015-08-26 * Reduced job limit (8 -> 6) -2015-08-14 +2015-08-14 * Reduced job limit (10 -> 8) -2015-08-11 +2015-08-11 + New table `wiki_meta` -2015-05-22 +2015-05-22 * Updated status codes for `wiki_raw` table -2015-05-21 +2015-05-21 * Original version """ - + # first party from . import wiki_update from . import wiki_download @@ -115,31 +115,27 @@ def main(): - # step 1: find new access logs (aka "jobs") - print('looking for new jobs...') - try: - wiki_update.run() - except: - print('wiki_update failed') - - # step 2: run a few jobs - print('running jobs...') - try: - wiki_download.run( - secrets.wiki.hmac, - download_limit=1024 * 1024 * 1024, - job_limit=12 - ) - except: - print('wiki_download failed') - - # step 3: extract counts from the staging data - print('extracting counts...') - try: - wiki_extract.run(job_limit=100) - except: - print('wiki_extract failed') - - -if __name__ == '__main__': - main() + # step 1: find new access logs (aka "jobs") + print("looking for new jobs...") + try: + wiki_update.run() + except: + print("wiki_update failed") + + # step 2: run a few jobs + print("running jobs...") + try: + wiki_download.run(secrets.wiki.hmac, download_limit=1024 * 1024 * 1024, job_limit=12) + except: + print("wiki_download failed") + + # step 3: extract counts from the staging data + print("extracting counts...") + try: + wiki_extract.run(job_limit=100) + except: + print("wiki_extract failed") + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 1a01b7f8e..07cc7fdc1 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -27,16 +27,16 @@ """ # python 2 and 3 -from __future__ import print_function import sys + if sys.version_info.major == 2: - # python 2 libraries - from urllib import urlencode - from urllib2 import urlopen + # python 2 libraries + from urllib import urlencode + from urllib2 import urlopen else: - # python 3 libraries - from urllib.parse import urlencode - from urllib.request import urlopen + # python 3 libraries + from urllib.parse import urlencode + from urllib.request import urlopen # common libraries import argparse @@ -53,234 +53,274 @@ VERSION = 10 -MASTER_URL = 'https://delphi.cmu.edu/~automation/public/wiki/master.php' +MASTER_URL = "https://delphi.cmu.edu/~automation/public/wiki/master.php" + def text(data_string): - return str(data_string.decode('utf-8')) + return str(data_string.decode("utf-8")) def data(text_string): - if sys.version_info.major == 2: - return text_string - else: - return bytes(text_string, 'utf-8') + if 
sys.version_info.major == 2: + return text_string + else: + return bytes(text_string, "utf-8") def get_hmac_sha256(key, msg): - key_bytes, msg_bytes = key.encode('utf-8'), msg.encode('utf-8') - return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() + key_bytes, msg_bytes = key.encode("utf-8"), msg.encode("utf-8") + return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() def extract_article_counts(filename, language, articles, debug_mode): - """ - Support multiple languages ('en' | 'es' | 'pt') - Running time optimized to O(M), which means only need to scan the whole file once - :param filename: - :param language: Different languages such as 'en', 'es', and 'pt' - :param articles: - :param debug_mode: - :return: - """ - counts = {} - articles_set = set(map(lambda x: x.lower(), articles)) - total = 0 - with open(filename, "r", encoding="utf8") as f: - for line in f: - content = line.strip().split() - if len(content) != 4: - print('unexpected article format: {0}'.format(line)) - continue - article_title = content[1].lower() - article_count = int(content[2]) - if content[0] == language: - total += article_count - if content[0] == language and article_title in articles_set: - if debug_mode: - print("Find article {0}: {1}".format(article_title, line)) - counts[article_title] = article_count - if debug_mode: - print("Total number of counts for language {0} is {1}".format(language, total)) - counts['total'] = total - return counts + """ + Support multiple languages ('en' | 'es' | 'pt') + Running time optimized to O(M), which means only need to scan the whole file once + :param filename: + :param language: Different languages such as 'en', 'es', and 'pt' + :param articles: + :param debug_mode: + :return: + """ + counts = {} + articles_set = set(map(lambda x: x.lower(), articles)) + total = 0 + with open(filename, encoding="utf8") as f: + for line in f: + content = line.strip().split() + if len(content) != 4: + print(f"unexpected article format: {line}") + continue + article_title = content[1].lower() + article_count = int(content[2]) + if content[0] == language: + total += article_count + if content[0] == language and article_title in articles_set: + if debug_mode: + print(f"Find article {article_title}: {line}") + counts[article_title] = article_count + if debug_mode: + print(f"Total number of counts for language {language} is {total}") + counts["total"] = total + return counts def extract_article_counts_orig(articles, debug_mode): - """ - The original method which extracts article counts by shell command grep (only support en articles). - As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. - Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), - where N is the number of articles and M is the lines in the file - In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding - :param articles: - :param debug_mode: - :return: - """ - counts = {} - for article in articles: - if debug_mode: - print(' %s' % (article)) + """ + The original method which extracts article counts by shell command grep (only support en articles). + As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. 
+ Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), + where N is the number of articles and M is the lines in the file + In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding + :param articles: + :param debug_mode: + :return: + """ + counts = {} + for article in articles: + if debug_mode: + print(" %s" % (article)) + out = text( + subprocess.check_output( + 'LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True + ) + ).strip() + count = 0 + if len(out) > 0: + for line in out.split("\n"): + fields = line.split() + if len(fields) != 4: + print("unexpected article format: [%s]" % (line)) + else: + count += int(fields[2]) + # print ' %4d %s'%(count, article) + counts[article.lower()] = count + if debug_mode: + print(" %d" % (count)) + print("getting total count...") out = text( - subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() - count = 0 - if len(out) > 0: - for line in out.split('\n'): - fields = line.split() - if len(fields) != 4: - print('unexpected article format: [%s]' % (line)) - else: - count += int(fields[2]) - # print ' %4d %s'%(count, article) - counts[article.lower()] = count + subprocess.check_output( + 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', + shell=True, + ) + ) + total = int(out) if debug_mode: - print(' %d' % (count)) - print('getting total count...') - out = text(subprocess.check_output( - 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) - total = int(out) - if debug_mode: - print(total) - counts['total'] = total - return counts + print(total) + counts["total"] = total + return counts def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): - worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() - print('this is [%s]'%(worker)) - if debug_mode: - print('*** running in debug mode ***') - - total_download = 0 - passed_jobs = 0 - failed_jobs = 0 - while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): - try: - time_start = datetime.datetime.now() - req = urlopen(MASTER_URL + '?get=x&type=%s'%(job_type)) - code = req.getcode() - if code != 200: - if code == 201: - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - else: - raise Exception('server response code (get) was %d'%(code)) - # Make the code compatible with mac os system - if platform == "darwin": - job_content = text(req.readlines()[1]) - else: - job_content = text(req.readlines()[0]) - if job_content == 'no jobs': - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - job = json.loads(job_content) - print('received job [%d|%s]'%(job['id'], job['name'])) - # updated parsing for pageviews - maybe use a regex in the future - #year, month = int(job['name'][11:15]), int(job['name'][15:17]) - year, month = int(job['name'][10:14]), int(job['name'][14:16]) - #print 'year=%d | month=%d'%(year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s'%(year, year, month, job['name']) - print('downloading file [%s]...'%(url)) - 
subprocess.check_call('curl -s %s > raw.gz'%(url), shell=True) - print('checking file size...') - # Make the code cross-platfrom, so use python to get the size of the file - # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) - size = os.stat("raw.gz").st_size - if debug_mode: - print(size) - total_download += size - if job['hash'] != '00000000000000000000000000000000': - print('checking hash...') - out = text(subprocess.check_output('md5sum raw.gz', shell=True)) - result = out[0:32] - if result != job['hash']: - raise Exception('wrong hash [expected %s, got %s]'%(job['hash'], result)) - if debug_mode: - print(result) - print('decompressing...') - subprocess.check_call('gunzip -f raw.gz', shell=True) - #print 'converting case...' - #subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) - #subprocess.check_call('rm raw', shell=True) - subprocess.check_call('mv raw raw2', shell=True) - print('extracting article counts...') - - # Use python to read the file and extract counts, if you want to use the original shell method, please use - counts = {} - for language in wiki_util.Articles.available_languages: - lang2articles = {'en': wiki_util.Articles.en_articles, 'es': wiki_util.Articles.es_articles, 'pt': wiki_util.Articles.pt_articles} - articles = lang2articles[language] - articles = sorted(articles) - if debug_mode: - print("Language is {0} and target articles are {1}".format(language, articles)) - temp_counts = extract_article_counts("raw2", language, articles, debug_mode) - counts[language] = temp_counts - - if not debug_mode: - print('deleting files...') - subprocess.check_call('rm raw2', shell=True) - print('saving results...') - time_stop = datetime.datetime.now() - result = { - 'id': job['id'], - 'size': size, - 'data': json.dumps(counts), - 'worker': worker, - 'elapsed': (time_stop - time_start).total_seconds(), - } - payload = json.dumps(result) - hmac_str = get_hmac_sha256(secret, payload) - if debug_mode: - print(' hmac: %s' % hmac_str) - post_data = urlencode({'put': payload, 'hmac': hmac_str}) - req = urlopen(MASTER_URL, data=data(post_data)) - code = req.getcode() - if code != 200: - raise Exception('server response code (put) was %d'%(code)) - print('done! 
(dl=%d)'%(total_download)) - passed_jobs += 1 - except Exception as ex: - print('***** Caught Exception: %s *****'%(str(ex))) - failed_jobs += 1 - time.sleep(30) - print('passed=%d | failed=%d | total=%d'%(passed_jobs, failed_jobs, passed_jobs + failed_jobs)) - time.sleep(sleep_time) - - if download_limit is not None and total_download >= download_limit: - print('download limit has been reached [%d >= %d]'%(total_download, download_limit)) - if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: - print('job limit has been reached [%d >= %d]'%(passed_jobs + failed_jobs, job_limit)) + worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() + print("this is [%s]" % (worker)) + if debug_mode: + print("*** running in debug mode ***") + + total_download = 0 + passed_jobs = 0 + failed_jobs = 0 + while (download_limit is None or total_download < download_limit) and ( + job_limit is None or (passed_jobs + failed_jobs) < job_limit + ): + try: + time_start = datetime.datetime.now() + req = urlopen(MASTER_URL + "?get=x&type=%s" % (job_type)) + code = req.getcode() + if code != 200: + if code == 201: + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + else: + raise Exception("server response code (get) was %d" % (code)) + # Make the code compatible with mac os system + if platform == "darwin": + job_content = text(req.readlines()[1]) + else: + job_content = text(req.readlines()[0]) + if job_content == "no jobs": + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + job = json.loads(job_content) + print("received job [%d|%s]" % (job["id"], job["name"])) + # updated parsing for pageviews - maybe use a regex in the future + # year, month = int(job['name'][11:15]), int(job['name'][15:17]) + year, month = int(job["name"][10:14]), int(job["name"][14:16]) + # print 'year=%d | month=%d'%(year, month) + url = "https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s" % ( + year, + year, + month, + job["name"], + ) + print("downloading file [%s]..." % (url)) + subprocess.check_call("curl -s %s > raw.gz" % (url), shell=True) + print("checking file size...") + # Make the code cross-platfrom, so use python to get the size of the file + # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) + size = os.stat("raw.gz").st_size + if debug_mode: + print(size) + total_download += size + if job["hash"] != "00000000000000000000000000000000": + print("checking hash...") + out = text(subprocess.check_output("md5sum raw.gz", shell=True)) + result = out[0:32] + if result != job["hash"]: + raise Exception(f"wrong hash [expected {job['hash']}, got {result}]") + if debug_mode: + print(result) + print("decompressing...") + subprocess.check_call("gunzip -f raw.gz", shell=True) + # print 'converting case...' 
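            # (The commented-out case-folding shell pass is redundant here:
            #  extract_article_counts() lowercases both the target titles and each
            #  line's article title before matching.)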
+ # subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) + # subprocess.check_call('rm raw', shell=True) + subprocess.check_call("mv raw raw2", shell=True) + print("extracting article counts...") + + # Use python to read the file and extract counts, if you want to use the original shell method, please use + counts = {} + for language in wiki_util.Articles.available_languages: + lang2articles = { + "en": wiki_util.Articles.en_articles, + "es": wiki_util.Articles.es_articles, + "pt": wiki_util.Articles.pt_articles, + } + articles = lang2articles[language] + articles = sorted(articles) + if debug_mode: + print(f"Language is {language} and target articles are {articles}") + temp_counts = extract_article_counts("raw2", language, articles, debug_mode) + counts[language] = temp_counts + + if not debug_mode: + print("deleting files...") + subprocess.check_call("rm raw2", shell=True) + print("saving results...") + time_stop = datetime.datetime.now() + result = { + "id": job["id"], + "size": size, + "data": json.dumps(counts), + "worker": worker, + "elapsed": (time_stop - time_start).total_seconds(), + } + payload = json.dumps(result) + hmac_str = get_hmac_sha256(secret, payload) + if debug_mode: + print(" hmac: %s" % hmac_str) + post_data = urlencode({"put": payload, "hmac": hmac_str}) + req = urlopen(MASTER_URL, data=data(post_data)) + code = req.getcode() + if code != 200: + raise Exception("server response code (put) was %d" % (code)) + print("done! (dl=%d)" % (total_download)) + passed_jobs += 1 + except Exception as ex: + print("***** Caught Exception: %s *****" % (str(ex))) + failed_jobs += 1 + time.sleep(30) + print( + "passed=%d | failed=%d | total=%d" + % (passed_jobs, failed_jobs, passed_jobs + failed_jobs) + ) + time.sleep(sleep_time) + + if download_limit is not None and total_download >= download_limit: + print("download limit has been reached [%d >= %d]" % (total_download, download_limit)) + if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: + print("job limit has been reached [%d >= %d]" % (passed_jobs + failed_jobs, job_limit)) def main(): - # version info - print('version', VERSION) - - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('secret', type=str, help='hmac secret key') - parser.add_argument('-b', '--blimit', action='store', type=int, default=None, help='download limit, in bytes') - parser.add_argument('-j', '--jlimit', action='store', type=int, default=None, help='job limit') - parser.add_argument('-s', '--sleep', action='store', type=int, default=1, help='seconds to sleep between each job') - parser.add_argument('-t', '--type', action='store', type=int, default=0, help='type of job') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - # runtime options - secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug - - # run - run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) - - -if __name__ == '__main__': - main() + # version info + print("version", VERSION) + + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("secret", type=str, help="hmac secret key") + parser.add_argument( + "-b", "--blimit", action="store", type=int, default=None, help="download limit, in bytes" + ) + parser.add_argument("-j", "--jlimit", action="store", type=int, default=None, help="job limit") + 
parser.add_argument( + "-s", + "--sleep", + action="store", + type=int, + default=1, + help="seconds to sleep between each job", + ) + parser.add_argument("-t", "--type", action="store", type=int, default=0, help="type of job") + parser.add_argument( + "-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode" + ) + args = parser.parse_args() + + # runtime options + secret, download_limit, job_limit, sleep_time, job_type, debug_mode = ( + args.secret, + args.blimit, + args.jlimit, + args.sleep, + args.type, + args.debug, + ) + + # run + run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_extract.py b/src/acquisition/wiki/wiki_extract.py index 839d7d6dc..f4e0efb96 100644 --- a/src/acquisition/wiki/wiki_extract.py +++ b/src/acquisition/wiki/wiki_extract.py @@ -35,74 +35,96 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime( + int(name[10:14]), + int(name[14:16]), + int(name[16:18]), + int(name[19:21]), + int(name[21:23]), + int(name[23:25]), + ) def run(job_limit=100): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently - # cur.execute("SET NAMES utf8;") - # cur.execute("SET CHARACTER SET utf8;") - # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer - # cur.execute("SET character_set_client=utf8mb4;") - # cur.execute("SET character_set_connection=utf8mb4;") - # cur.execute("SET character_set_database=utf8;") - # cur.execute("SET character_set_results=utf8mb4;") - # cur.execute("SET character_set_server=utf8;") - # cur.execute("SET collation_connection=utf8mb4_general_ci;") - # cur.execute("SET collation_database=utf8_general_ci;") - # cur.execute("SET collation_server=utf8_general_ci;") - - # find jobs that are queued for extraction - cur.execute('SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s', (job_limit,)) - jobs = [] - for (id, name, data_str) in cur: - jobs.append((id, name, json.loads(data_str))) - print('Processing data from %d jobs'%(len(jobs))) - - # get the counts from the json object and insert into (or update) the database - # Notice that data_collect contains data with different languages - for (id, name, data_collect) in jobs: - print('processing job [%d|%s]...'%(id, name)) - timestamp = round_timestamp(get_timestamp(name)) - for language in data_collect.keys(): - data = data_collect[language] - for article in sorted(data.keys()): - count = data[article] - cur.execute('INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s', (str(timestamp), article.encode('utf-8').decode('latin-1'), count, language, count)) - if article == 'total': - cur.execute('INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s', (str(timestamp), str(timestamp), str(timestamp), count, language, count)) - # update the job - cur.execute('UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s', (id,)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently + # cur.execute("SET NAMES utf8;") + # cur.execute("SET CHARACTER SET utf8;") + # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer + # cur.execute("SET character_set_client=utf8mb4;") + # cur.execute("SET character_set_connection=utf8mb4;") + # cur.execute("SET character_set_database=utf8;") + # cur.execute("SET character_set_results=utf8mb4;") + # cur.execute("SET character_set_server=utf8;") + # cur.execute("SET collation_connection=utf8mb4_general_ci;") + # cur.execute("SET collation_database=utf8_general_ci;") + # cur.execute("SET collation_server=utf8_general_ci;") + + # find jobs that are queued for extraction + cur.execute( + "SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s", + (job_limit,), + ) + jobs = [] + for (id, name, data_str) in cur: + jobs.append((id, name, json.loads(data_str))) + print("Processing data from %d jobs" % (len(jobs))) + + # get the counts from the json object and insert into (or update) the database + # Notice that data_collect contains data with different languages + for (id, name, data_collect) in jobs: + print("processing job [%d|%s]..." % (id, name)) + timestamp = round_timestamp(get_timestamp(name)) + for language in data_collect.keys(): + data = data_collect[language] + for article in sorted(data.keys()): + count = data[article] + cur.execute( + "INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s", + ( + str(timestamp), + article.encode("utf-8").decode("latin-1"), + count, + language, + count, + ), + ) + if article == "total": + cur.execute( + "INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s", + (str(timestamp), str(timestamp), str(timestamp), count, language, count), + ) + # update the job + cur.execute("UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s", (id,)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_update.py b/src/acquisition/wiki/wiki_update.py index 411544810..c9aa6d6a2 100644 --- a/src/acquisition/wiki/wiki_update.py +++ b/src/acquisition/wiki/wiki_update.py @@ -32,87 +32,100 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # If the program is cold start (there are no previous names in the table, and the name will be None) - if name is None: - curr = datetime.now() - return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return 
datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # If the program is cold start (there are no previous names in the table, and the name will be None) + if name is None: + curr = datetime.now() + return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime( + int(name[10:14]), + int(name[14:16]), + int(name[16:18]), + int(name[19:21]), + int(name[21:23]), + int(name[23:25]), + ) def get_manifest(year, month, optional=False): - # unlike pagecounts-raw, pageviews doesn't provide hashes - #url = 'https://dumps.wikimedia.org/other/pagecounts-raw/%d/%d-%02d/md5sums.txt'%(year, year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/' % (year, year, month) - print('Checking manifest at %s...'%(url)) - response = requests.get(url) - if response.status_code == 200: - #manifest = [line.strip().split() for line in response.text.split('\n') if 'pagecounts' in line] - manifest = [('00000000000000000000000000000000', line[9:37]) for line in response.text.split('\n') if ' max_name: - new_logs[name] = hash - print(' New job: %s [%s]'%(name, hash)) - print('Found %d new job(s)'%(len(new_logs))) - - # store metadata for new jobs - for name in sorted(new_logs.keys()): - cur.execute('INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)', (name, new_logs[name])) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # get the most recent job in wiki_raw + # luckily, "pageviews" is lexicographically greater than "pagecounts-raw" + cur.execute("SELECT max(`name`) FROM `wiki_raw`") + for (max_name,) in cur: + pass + print("Last known file: %s" % (max_name)) + timestamp = get_timestamp(max_name) + + # crawl dumps.wikimedia.org to find more recent access logs + t1, t2 = floor_timestamp(timestamp), ceil_timestamp(timestamp) + manifest = get_manifest(t1.year, t1.month, optional=False) + if t2.month != t1.month: + manifest += get_manifest(t2.year, t2.month, optional=True) + + # find access logs newer than the most recent job + new_logs = {} + for (hash, name) in manifest: + if max_name is None or name > max_name: + new_logs[name] = hash + print(f" New job: {name} [{hash}]") + print("Found %d new job(s)" % (len(new_logs))) + + # store metadata for new jobs + for name in sorted(new_logs.keys()): + cur.execute( + "INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)", (name, new_logs[name]) + ) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_util.py b/src/acquisition/wiki/wiki_util.py index ed3c743bc..55bf3e2ca 100644 --- a/src/acquisition/wiki/wiki_util.py +++ b/src/acquisition/wiki/wiki_util.py @@ -1,159 +1,156 @@ - - - class Articles: # Notice that all languages must be two chars, because that `language` column in table `wiki` is CHAR(2) - available_languages = ['en', 'es', 'pt'] + available_languages = ["en", "es", "pt"] en_articles_flu = [ - 'Influenza_B_virus', - 
'Influenza_A_virus', - 'Human_flu', - 'Influenzavirus_C', - 'Oseltamivir', - 'Influenza', - 'Influenzavirus_A', - 'Influenza_A_virus_subtype_H1N1', - 'Zanamivir', - 'Influenza-like_illness', - 'Common_cold', - 'Sore_throat', - 'Flu_season', - 'Chills', - 'Fever', - 'Influenza_A_virus_subtype_H2N2', - 'Swine_influenza', - 'Shivering', - 'Canine_influenza', - 'Influenza_A_virus_subtype_H3N2', - 'Neuraminidase_inhibitor', - 'Influenza_pandemic', - 'Viral_pneumonia', - 'Influenza_prevention', - 'Influenza_A_virus_subtype_H1N2', - 'Rhinorrhea', - 'Orthomyxoviridae', - 'Nasal_congestion', - 'Gastroenteritis', - 'Rimantadine', - 'Paracetamol', - 'Amantadine', - 'Viral_neuraminidase', - 'Headache', - 'Influenza_vaccine', - 'Vomiting', - 'Cough', - 'Influenza_A_virus_subtype_H5N1', - 'Nausea', - 'Avian_influenza', - 'Influenza_A_virus_subtype_H7N9', - 'Influenza_A_virus_subtype_H10N7', - 'Influenza_A_virus_subtype_H9N2', - 'Hemagglutinin_(influenza)', - 'Influenza_A_virus_subtype_H7N7', - 'Fatigue_(medical)', - 'Myalgia', - 'Influenza_A_virus_subtype_H7N3', - 'Malaise', - 'Equine_influenza', - 'Cat_flu', - 'Influenza_A_virus_subtype_H3N8', - 'Antiviral_drugs', - 'Influenza_A_virus_subtype_H7N2', + "Influenza_B_virus", + "Influenza_A_virus", + "Human_flu", + "Influenzavirus_C", + "Oseltamivir", + "Influenza", + "Influenzavirus_A", + "Influenza_A_virus_subtype_H1N1", + "Zanamivir", + "Influenza-like_illness", + "Common_cold", + "Sore_throat", + "Flu_season", + "Chills", + "Fever", + "Influenza_A_virus_subtype_H2N2", + "Swine_influenza", + "Shivering", + "Canine_influenza", + "Influenza_A_virus_subtype_H3N2", + "Neuraminidase_inhibitor", + "Influenza_pandemic", + "Viral_pneumonia", + "Influenza_prevention", + "Influenza_A_virus_subtype_H1N2", + "Rhinorrhea", + "Orthomyxoviridae", + "Nasal_congestion", + "Gastroenteritis", + "Rimantadine", + "Paracetamol", + "Amantadine", + "Viral_neuraminidase", + "Headache", + "Influenza_vaccine", + "Vomiting", + "Cough", + "Influenza_A_virus_subtype_H5N1", + "Nausea", + "Avian_influenza", + "Influenza_A_virus_subtype_H7N9", + "Influenza_A_virus_subtype_H10N7", + "Influenza_A_virus_subtype_H9N2", + "Hemagglutinin_(influenza)", + "Influenza_A_virus_subtype_H7N7", + "Fatigue_(medical)", + "Myalgia", + "Influenza_A_virus_subtype_H7N3", + "Malaise", + "Equine_influenza", + "Cat_flu", + "Influenza_A_virus_subtype_H3N8", + "Antiviral_drugs", + "Influenza_A_virus_subtype_H7N2", ] en_articles_noro = [ - 'Norovirus', - 'Diarrhea', - 'Dehydration', - 'Gastroenteritis', - 'Vomiting', - 'Abdominal_pain', - 'Nausea', - 'Foodborne_illness', - 'Rotavirus', - 'Fecal–oral_route', - 'Intravenous_therapy', - 'Oral_rehydration_therapy', - 'Shellfish', - 'Caliciviridae', - 'Leaky_scanning', + "Norovirus", + "Diarrhea", + "Dehydration", + "Gastroenteritis", + "Vomiting", + "Abdominal_pain", + "Nausea", + "Foodborne_illness", + "Rotavirus", + "Fecal–oral_route", + "Intravenous_therapy", + "Oral_rehydration_therapy", + "Shellfish", + "Caliciviridae", + "Leaky_scanning", ] en_articles_dengue = [ - 'Dengue_fever', - 'Dengue_virus', - 'Aedes', - 'Aedes_aegypti', - 'Dengue_vaccine', - 'Mosquito', - 'Mosquito-borne_disease', - 'Blood_transfusion', - 'Paracetamol', - 'Fever', - 'Headache', - 'Rhinitis', - 'Flavivirus', - 'Exanthem', - 'Myalgia', - 'Arthralgia', - 'Thrombocytopenia', - 'Hematuria', - 'Nosebleed', - 'Petechia', - 'Nausea', - 'Vomiting', - 'Diarrhea', + "Dengue_fever", + "Dengue_virus", + "Aedes", + "Aedes_aegypti", + "Dengue_vaccine", + "Mosquito", + "Mosquito-borne_disease", + 
"Blood_transfusion", + "Paracetamol", + "Fever", + "Headache", + "Rhinitis", + "Flavivirus", + "Exanthem", + "Myalgia", + "Arthralgia", + "Thrombocytopenia", + "Hematuria", + "Nosebleed", + "Petechia", + "Nausea", + "Vomiting", + "Diarrhea", ] en_articles = list(set(en_articles_flu + en_articles_noro + en_articles_dengue)) es_articles = [ - 'Dengue', - 'Virus_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusión_de_sangre', - 'Paracetamol', - 'Fiebre', - 'Cefalea', - 'Coriza', - 'Flavivirus', - 'Exantema', - 'Mosquito', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematuria', - 'Epistaxis', - 'Petequia', - 'Náusea', - 'Vómito', - 'Diarrea', + "Dengue", + "Virus_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusión_de_sangre", + "Paracetamol", + "Fiebre", + "Cefalea", + "Coriza", + "Flavivirus", + "Exantema", + "Mosquito", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematuria", + "Epistaxis", + "Petequia", + "Náusea", + "Vómito", + "Diarrea", ] pt_articles = [ - 'Dengue', - 'Vírus_da_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusão_de_sangue', - 'Paracetamol', - 'Febre', - 'Cefaleia', - 'Coriza', - 'Flavivírus', - 'Exantema', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematúria', - 'Epistaxe', - 'Petéquia', - 'Náusea', - 'Vômito', - 'Diarreia', + "Dengue", + "Vírus_da_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusão_de_sangue", + "Paracetamol", + "Febre", + "Cefaleia", + "Coriza", + "Flavivírus", + "Exantema", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematúria", + "Epistaxe", + "Petéquia", + "Náusea", + "Vômito", + "Diarreia", ] From c827e54de344ef54f115bc3bc8083713e835c059 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:09:56 -0700 Subject: [PATCH 30/43] ci(sonar): tempfiles for security warnings --- src/acquisition/ecdc/ecdc_db_update.py | 38 ++++++++++----------- src/acquisition/paho/paho_db_update.py | 46 ++++++++++++-------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 6e0083ecc..86e3b1cd8 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -183,27 +183,23 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - download_ecdc_data(download_dir=tmp_dir) - issue = EpiDate.today().get_ew() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - db_error = False - for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + download_ecdc_data(download_dir=tmp_dir) + issue = EpiDate.today().get_ew() + files = glob.glob(f"{tmp_dir}/*.csv") + for filename in files: + with open(filename) as f: + _ = f.readline() + db_error = False + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not 
db_error: + break # Exit loop with success if flag >= max_tries: print("WARNING: Database `ecdc_ili` did not update successfully") diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index 08577f580..67fbc1d28 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -261,32 +261,28 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - get_paho_data(dir=tmp_dir) - issue = EpiDate.today().get_ew() - # Check to make sure we downloaded a file for every week - issueset = set() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - data = f.readline().split(',') - issueset.add(data[6]) - db_error = False - if len(issueset) >= 53: # Shouldn't be more than 53 + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + get_paho_data(dir=tmp_dir) + issue = EpiDate.today().get_ew() + # Check to make sure we downloaded a file for every week + issueset = set() + files = glob.glob(f"{tmp_dir}/*.csv") for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with open(filename) as f: + _ = f.readline() + data = f.readline().split(",") + issueset.add(data[6]) + db_error = False + if len(issueset) >= 53: # Shouldn't be more than 53 + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not db_error: + break # Exit loop with success if flag >= max_tries: print("WARNING: Database `paho_dengue` did not update successfully") From 76ddfbff9700f85ca145f7d1333bc00edc2da47d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 26 May 2023 19:53:22 -0700 Subject: [PATCH 31/43] style: add .editorconfig --- .editorconfig | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..8a80734f0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# EditorConfig helps developers define and maintain consistent +# coding styles between different editors and IDEs +# editorconfig.org + +root = true + + +[*] + +# Change these settings to your own preference +indent_style = space +indent_size = 4 + +# We recommend you to keep these unchanged +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false From 145dd42fe7561bdcc38b40b3769bd840057e05f5 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 5 Jun 2023 15:48:57 -0700 Subject: [PATCH 32/43] style(pylint): add pylint config --- pyproject.toml | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d8589df09..a4399ca9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,26 @@ - [tool.black] line-length = 100 target-version = ['py38'] include = 'server,tests/server' + +[tool.pylint] + [tool.pylint.'MESSAGES CONTROL'] + max-line-length = 100 + 
disable = [ + 'logging-format-interpolation', + # Allow pytest functions to be part of a class + 'no-self-use', + 'too-many-locals', + 'too-many-arguments', + # Allow pytest classes to have one test + 'too-few-public-methods', + ] + + [tool.pylint.'BASIC'] + # Allow arbitrarily short-named variables. + variable-rgx = ['[a-z_][a-z0-9_]*'] + argument-rgx = [ '[a-z_][a-z0-9_]*' ] + attr-rgx = ['[a-z_][a-z0-9_]*'] + + [tool.pylint.'DESIGN'] + ignored-argument-names = ['(_.*|run_as_module)'] From 1e7319ef5ea075c666e75e4fb540fb4514ec4dcd Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 13:30:34 -0700 Subject: [PATCH 33/43] Update src/acquisition/cdcp/cdc_extract.py Co-authored-by: Katie Mazaitis --- src/acquisition/cdcp/cdc_extract.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index e4d7af573..b8f772684 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -173,22 +173,12 @@ def extract(first_week=None, last_week=None, test_mode=False): # update each state for state in states: try: - num1 = get_num_hits(cur, epiweek, state, pages[0]) - num2 = get_num_hits(cur, epiweek, state, pages[1]) - num3 = get_num_hits(cur, epiweek, state, pages[2]) - num4 = get_num_hits(cur, epiweek, state, pages[3]) - num5 = get_num_hits(cur, epiweek, state, pages[4]) - num6 = get_num_hits(cur, epiweek, state, pages[5]) - num7 = get_num_hits(cur, epiweek, state, pages[6]) - num8 = get_num_hits(cur, epiweek, state, pages[7]) + nums = [] + for i in range(8): + nums[i] = get_num_hits(cur, epiweek, state, pages[i]) total = get_total_hits(cur, epiweek, state) - store_result( - cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total - ) - print( - " %d-%s: %d %d %d %d %d %d %d %d (%d)" - % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) - ) + store_result(cur, epiweek, state, *nums, total) + print(f" {epiweek}-{state}: {' '.join(str(n) for n in nums)} ({total})") except Exception as ex: print(" %d-%s: failed" % (epiweek, state), ex) # raise ex From b00f11bc439da092d76cdaba92f48d328ef10844 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 14:18:04 -0700 Subject: [PATCH 34/43] style(black): add fmt off tags around parser.add_argument calls --- src/acquisition/ght/ght_update.py | 22 +++++---------------- src/acquisition/ght/google_health_trends.py | 6 +++++- src/acquisition/quidel/quidel_update.py | 14 ++++--------- src/acquisition/twtr/healthtweets.py | 22 ++++++--------------- src/acquisition/wiki/wiki_download.py | 19 +++++------------- 5 files changed, 25 insertions(+), 58 deletions(-) diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index 76046c5c4..7f65bbfe5 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -351,25 +351,13 @@ def get_num_rows(): def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument( - "location", - action="store", - type=str, - default=None, - help="location(s) (ex: all; US; TX; CA,LA,WY)", - ) - parser.add_argument( - "term", - action="store", - type=str, - default=None, - help='term/query/topic (ex: all; /m/0cycc; "flu fever")', - ) + # fmt: off + parser.add_argument("location", action="store", type=str, default=None, help="location(s) (ex: all; US; TX; CA,LA,WY)") + parser.add_argument("term", action="store", type=str, default=None, help='term/query/topic (ex: 
all; /m/0cycc; "flu fever")') parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") - parser.add_argument( - "--country", "-c", default="US", type=str, help="location country (ex: US; BR)" - ) + parser.add_argument("--country", "-c", default="US", type=str, help="location country (ex: US; BR)") + # fmt: on args = parser.parse_args() # sanity check diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 7fd95f9a4..69d751e95 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -114,7 +114,10 @@ def get_data(self, start_week, end_week, location, term, resolution="week", coun def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument("apikey", action="store", type=str, default=None, help="API key") + # fmt: off + parser.add_argument( + "apikey", action="store", type=str, default=None, help="API key" + ) parser.add_argument( "startweek", action="store", type=int, default=None, help="first week (ex: 201440)" ) @@ -127,6 +130,7 @@ def main(): parser.add_argument( "term", action="store", type=str, default=None, help="term/query/topic (ex: /m/0cycc)" ) + # fmt: on args = parser.parse_args() # get the data diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index 286a30834..06f8b9da5 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -124,19 +124,13 @@ def get_num_rows(): def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument( - "--location", - action="store", - type=str, - default=None, - help="location(s) (ex: all; any of hhs1-10)", - ) + # fmt: off + parser.add_argument("--location", action="store", type=str, default=None, help="location(s) (ex: all; any of hhs1-10)") parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") parser.add_argument("--force_update", "-u", action="store_true", help="force update db values") - parser.add_argument( - "--skip_email", "-s", action="store_true", help="skip email downloading step" - ) + parser.add_argument("--skip_email", "-s", action="store_true", help="skip email downloading step") + # fmt: on args = parser.parse_args() # sanity check diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 31976f376..f64bbd689 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -216,24 +216,14 @@ def _go(self, url, method=None, referer=None, data=None): def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument("username", action="store", type=str, help="healthtweets.org username") parser.add_argument("password", action="store", type=str, help="healthtweets.org password") - parser.add_argument( - "state", - action="store", - type=str, - choices=list(HealthTweets.STATE_CODES.keys()), - help="U.S. 
state (ex: TX)", - ) - parser.add_argument( - "date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)" - ) - parser.add_argument( - "date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)" - ) - parser.add_argument( - "-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode" - ) + parser.add_argument("state", action="store", type=str, choices=list(HealthTweets.STATE_CODES.keys()), help="U.S. state (ex: TX)") + parser.add_argument("date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)") + parser.add_argument("date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)") + parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode") + # fmt: on args = parser.parse_args() ht = HealthTweets(args.username, args.password, debug=args.debug) diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 07cc7fdc1..c32fc87ed 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -289,23 +289,14 @@ def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument("secret", type=str, help="hmac secret key") - parser.add_argument( - "-b", "--blimit", action="store", type=int, default=None, help="download limit, in bytes" - ) + parser.add_argument("-b", "--blimit", action="store", type=int, default=None, help="download limit, in bytes") parser.add_argument("-j", "--jlimit", action="store", type=int, default=None, help="job limit") - parser.add_argument( - "-s", - "--sleep", - action="store", - type=int, - default=1, - help="seconds to sleep between each job", - ) + parser.add_argument("-s", "--sleep", action="store", type=int, default=1, help="seconds to sleep between each job") parser.add_argument("-t", "--type", action="store", type=int, default=0, help="type of job") - parser.add_argument( - "-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode" - ) + parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode") + # fmt: on args = parser.parse_args() # runtime options From dd1b08994278e7a4586d6af34028c2d3fff378b1 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 21 Jun 2023 15:24:32 -0700 Subject: [PATCH 35/43] style: update .editorconfig --- .editorconfig | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.editorconfig b/.editorconfig index 8a80734f0..b76cfd14a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -4,18 +4,19 @@ root = true - [*] - -# Change these settings to your own preference -indent_style = space -indent_size = 4 - # We recommend you to keep these unchanged end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true + +[*.py] +# Change these settings to your own preference +indent_style = space +indent_size = 4 + + [*.md] trim_trailing_whitespace = false From 7a27a3a1256147ffcfcbfffbc80fd28f399b2752 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 23 Jun 2023 10:26:47 -0700 Subject: [PATCH 36/43] style(flynt): convert .format and % strings to f-strings --- src/acquisition/cdcp/cdc_dropbox_receiver.py | 2 +- src/acquisition/cdcp/cdc_extract.py | 6 +-- src/acquisition/cdcp/cdc_upload.py | 2 +- src/acquisition/ecdc/ecdc_db_update.py | 12 +++--- src/acquisition/flusurv/flusurv.py | 6 +-- src/acquisition/flusurv/flusurv_update.py | 6 +-- 
src/acquisition/fluview/fluview.py | 14 +++---- src/acquisition/fluview/fluview_update.py | 34 ++++++++-------- .../fluview/impute_missing_values.py | 8 ++-- src/acquisition/ght/ght_update.py | 12 +++--- src/acquisition/ght/google_health_trends.py | 4 +- src/acquisition/kcdc/kcdc_update.py | 8 ++-- src/acquisition/nidss/taiwan_nidss.py | 12 +++--- src/acquisition/nidss/taiwan_update.py | 8 ++-- src/acquisition/paho/paho_db_update.py | 14 +++---- src/acquisition/paho/paho_download.py | 14 +++---- src/acquisition/quidel/quidel_update.py | 6 +-- src/acquisition/twtr/healthtweets.py | 21 +++++----- src/acquisition/twtr/twitter_update.py | 2 +- src/acquisition/wiki/wiki_download.py | 40 +++++++++---------- src/acquisition/wiki/wiki_extract.py | 4 +- src/acquisition/wiki/wiki_update.py | 12 +++--- 22 files changed, 121 insertions(+), 126 deletions(-) diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index 65626101b..4fa20368e 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -101,7 +101,7 @@ def fetch_data(): if resp.status_code != 200: raise Exception(["resp.status_code", resp.status_code]) dropbox_len = meta.size - print(" need %d bytes..." % dropbox_len) + print(f" need {int(dropbox_len)} bytes...") content_len = int(resp.headers.get("Content-Length", -1)) if dropbox_len != content_len: info = ["dropbox_len", dropbox_len, "content_len", content_len] diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index b8f772684..0d38e0bcc 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -110,7 +110,7 @@ def get_total_hits(cur, epiweek, state): for (total,) in cur: pass if total is None: - raise Exception("missing data for %d-%s" % (epiweek, state)) + raise Exception(f"missing data for {int(epiweek)}-{state}") return total @@ -166,7 +166,7 @@ def extract(first_week=None, last_week=None, test_mode=False): cur.execute("SELECT max(`epiweek`) FROM `cdc_meta`") for (last_week,) in cur: pass - print("extracting %d--%d" % (first_week, last_week)) + print(f"extracting {int(first_week)}--{int(last_week)}") # update each epiweek for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): @@ -180,7 +180,7 @@ def extract(first_week=None, last_week=None, test_mode=False): store_result(cur, epiweek, state, *nums, total) print(f" {epiweek}-{state}: {' '.join(str(n) for n in nums)} ({total})") except Exception as ex: - print(" %d-%s: failed" % (epiweek, state), ex) + print(f" {int(epiweek)}-{state}: failed", ex) # raise ex sys.stdout.flush() diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index fef0821b7..0e191267b 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -232,7 +232,7 @@ def parse_zip(zf, level=1): if handler is not None: with zf.open(name) as temp: count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) - print(prefix, " %d rows" % count) + print(prefix, f" {int(count)} rows") else: print(prefix, " (ignored)") diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 86e3b1cd8..9a90dad5c 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -87,7 +87,7 @@ def safe_int(i): def get_rows(cnx, table="ecdc_ili"): # Count and return the number of rows in the `ecdc_ili` table. 
select = cnx.cursor() - select.execute("SELECT count(1) num FROM %s" % table) + select.execute(f"SELECT count(1) num FROM {table}") for (num,) in select: pass select.close() @@ -100,7 +100,7 @@ def update_from_file(issue, date, dir, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx, "ecdc_ili") - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows @@ -115,9 +115,9 @@ def update_from_file(issue, date, dir, test_mode=False): row["region"] = data[4] row["incidence_rate"] = data[3] rows.append(row) - print(" loaded %d rows" % len(rows)) + print(f" loaded {len(rows)} rows") entries = [obj for obj in rows if obj] - print(" found %d entries" % len(entries)) + print(f" found {len(entries)} entries") sql = """ INSERT INTO @@ -149,7 +149,7 @@ def update_from_file(issue, date, dir, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -171,7 +171,7 @@ def main(): raise Exception("--file and --issue must both be present or absent") date = datetime.datetime.now().strftime("%Y-%m-%d") - print("assuming release date is today, %s" % date) + print(f"assuming release date is today, {date}") ensure_tables_exist() if args.file: diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 1e534b740..28105d933 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -80,7 +80,7 @@ def fetch_json(path, payload, call_count=1, requests_impl=requests): # it's polite to self-identify this "bot" delphi_url = "https://delphi.cmu.edu/index.html" - user_agent = "Mozilla/5.0 (compatible; delphibot/1.0; +%s)" % delphi_url + user_agent = f"Mozilla/5.0 (compatible; delphibot/1.0; +{delphi_url})" # the FluSurv AMF server flusurv_url = "https://gis.cdc.gov/GRASP/Flu3/" + path @@ -106,7 +106,7 @@ def fetch_json(path, payload, call_count=1, requests_impl=requests): if resp.status_code == 500 and call_count <= 2: # the server often fails with this status, so wait and retry delay = 10 * call_count - print("got status %d, will retry in %d sec..." 
% (resp.status_code, delay)) + print(f"got status {int(resp.status_code)}, will retry in {int(delay)} sec...") time.sleep(delay) return fetch_json(path, payload, call_count=call_count + 1) elif resp.status_code != 200: @@ -180,7 +180,7 @@ def extract_from_object(data_in): raise Exception("no data found") # print the result and return flu data - print("found data for %d weeks" % len(data_out)) + print(f"found data for {len(data_out)} weeks") return data_out diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 295091104..3009c7a3d 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -108,7 +108,7 @@ def update(issue, location_name, test_mode=False): cnx = mysql.connector.connect(host=secrets.db.host, user=u, password=p, database="epidata") cur = cnx.cursor() rows1 = get_rows(cur) - print("rows before: %d" % rows1) + print(f"rows before: {int(rows1)}") # SQL for insert/update sql = """ @@ -148,7 +148,7 @@ def update(issue, location_name, test_mode=False): # commit and disconnect rows2 = get_rows(cur) - print("rows after: %d (+%d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (+{int(rows2 - rows1)})") cur.close() if test_mode: print("test mode: not committing database changes") @@ -170,7 +170,7 @@ def main(): # scrape current issue from the main page issue = flusurv.get_current_issue() - print("current issue: %d" % issue) + print(f"current issue: {int(issue)}") # fetch flusurv data if args.location == "all": diff --git a/src/acquisition/fluview/fluview.py b/src/acquisition/fluview/fluview.py index a7e9fba87..9b4e6f537 100644 --- a/src/acquisition/fluview/fluview.py +++ b/src/acquisition/fluview/fluview.py @@ -108,7 +108,7 @@ def get_tier_ids(name): location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) num = len(location_ids[Key.TierType.hhs]) if num != 10: - raise Exception("expected 10 hhs regions, found %d" % num) + raise Exception(f"expected 10 hhs regions, found {int(num)}") # add location ids for census divisions for row in data[Key.TierListEntry.cen]: @@ -116,7 +116,7 @@ def get_tier_ids(name): location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) num = len(location_ids[Key.TierType.cen]) if num != 9: - raise Exception("expected 9 census divisions, found %d" % num) + raise Exception(f"expected 9 census divisions, found {int(num)}") # add location ids for states for row in data[Key.TierListEntry.sta]: @@ -124,7 +124,7 @@ def get_tier_ids(name): location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) num = len(location_ids[Key.TierType.sta]) if num != 57: - raise Exception("expected 57 states/territories/cities, found %d" % num) + raise Exception(f"expected 57 states/territories/cities, found {int(num)}") # return a useful subset of the metadata # (latest epiweek, latest season, tier ids, location ids) @@ -181,7 +181,7 @@ def save_latest(path=None): data = fetch_metadata(sess) info = get_issue_and_locations(data) issue = info["epiweek"] - print("current issue: %d" % issue) + print(f"current issue: {int(issue)}") # establish timing dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -200,7 +200,7 @@ def save_latest(path=None): ("cen", Key.TierType.cen), ("sta", Key.TierType.sta), ): - name = "ilinet_%s_%d_%s.zip" % (delphi_name, issue, dt) + name = f"ilinet_{delphi_name}_{int(issue)}_{dt}.zip" if path is None: filename = name else: @@ -209,12 +209,12 @@ def save_latest(path=None): locations = 
info["location_ids"][cdc_name] # download and show timing information - print("downloading %s" % delphi_name) + print(f"downloading {delphi_name}") t0 = time.time() size = download_data(tier_id, locations, seasons, filename) t1 = time.time() - print(" saved %s (%d bytes in %.1f seconds)" % (filename, size, t1 - t0)) + print(f" saved {filename} ({int(size)} bytes in {t1 - t0:.1f} seconds)") files.append(filename) # return the current issue and the list of downloaded files diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index e463fcbaf..2c2551831 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -297,7 +297,7 @@ def get_rows(cnx, table="fluview"): Looking at the fluview table by default, but may pass parameter to look at public health or clinical lab data instead.""" select = cnx.cursor() - select.execute("SELECT count(1) num FROM %s" % table) + select.execute(f"SELECT count(1) num FROM {table}") for (num,) in select: pass select.close() @@ -313,16 +313,16 @@ def update_from_file_clinical(issue, date, filename, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx, CL_TABLE) - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - print("loading data from %s as issued on %d" % (filename, issue)) + print(f"loading data from {filename} as issued on {int(issue)}") rows = load_zipped_csv(filename, CL_SHEET) - print(" loaded %d rows" % len(rows)) + print(f" loaded {len(rows)} rows") data = [get_clinical_data(row) for row in rows] entries = [obj for obj in data if obj] - print(" found %d entries" % len(entries)) + print(f" found {len(entries)} entries") sql = """ INSERT INTO @@ -365,7 +365,7 @@ def update_from_file_clinical(issue, date, filename, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -378,16 +378,16 @@ def update_from_file_public(issue, date, filename, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx, PHL_TABLE) - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - print("loading data from %s as issued on %d" % (filename, issue)) + print(f"loading data from {filename} as issued on {int(issue)}") rows = load_zipped_csv(filename, PHL_SHEET) - print(" loaded %d rows" % len(rows)) + print(f" loaded {len(rows)} rows") data = [get_public_data(row) for row in rows] entries = [obj for obj in data if obj] - print(" found %d entries" % len(entries)) + print(f" found {len(entries)} entries") sql = """ INSERT INTO @@ -434,7 +434,7 @@ def update_from_file_public(issue, date, filename, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -447,16 +447,16 @@ def update_from_file(issue, date, filename, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - print("loading data 
from %s as issued on %d" % (filename, issue)) + print(f"loading data from {filename} as issued on {int(issue)}") rows = load_zipped_csv(filename) - print(" loaded %d rows" % len(rows)) + print(f" loaded {len(rows)} rows") data = [get_ilinet_data(row) for row in rows] entries = [obj for obj in data if obj] - print(" found %d entries" % len(entries)) + print(f" found {len(entries)} entries") sql = """ INSERT INTO @@ -509,7 +509,7 @@ def update_from_file(issue, date, filename, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -531,7 +531,7 @@ def main(): raise Exception("--file and --issue must both be present or absent") date = datetime.datetime.now().strftime("%Y-%m-%d") - print("assuming release date is today, %s" % date) + print(f"assuming release date is today, {date}") if args.file: update_from_file(args.issue, date, args.file, test_mode=args.test) diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 230dd2f7d..4b3e1d684 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -270,13 +270,13 @@ def impute_missing_values(database, test_mode=False): # database connection database.connect() rows1 = database.count_rows() - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") # iterate over missing epiweeks missing_rows = database.find_missing_rows() - print("missing data for %d epiweeks" % len(missing_rows)) + print(f"missing data for {len(missing_rows)} epiweeks") for issue, epiweek in missing_rows: - print("i=%d e=%d" % (issue, epiweek)) + print(f"i={int(issue)} e={int(epiweek)}") # get known values from table `fluview` known_values = database.get_known_values(issue, epiweek) @@ -317,7 +317,7 @@ def impute_missing_values(database, test_mode=False): # database cleanup rows2 = database.count_rows() - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") commit = not test_mode database.close(commit) diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index 7f65bbfe5..b7d5fd493 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -266,7 +266,7 @@ def get_num_rows(): ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) ew0 = ew0 if first is None else first ew1 = ew1 if last is None else last - print("Checking epiweeks between %d and %d..." 
% (ew0, ew1)) + print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...") # keep track of how many rows were added rows_before = get_num_rows() @@ -283,7 +283,7 @@ def get_num_rows(): total_rows = 0 ght = GHT(API_KEY) for term in terms: - print(" [%s] using term" % term) + print(f" [{term}] using term") ll, cl = len(locations), len(countries) for i in range(max(ll, cl)): location = locations[i] if i < ll else locations[0] @@ -303,8 +303,7 @@ def get_num_rows(): else: delay = 2**attempt print( - " [%s|%s] caught exception (will retry in %ds):" - % (term, location, delay), + f" [{term}|{location}] caught exception (will retry in {int(delay)}s):", ex, ) time.sleep(delay) @@ -332,15 +331,14 @@ def get_num_rows(): ew = flu.add_epiweeks(ew, 1) if num_missing > 0: print( - " [%s|%s] missing %d/%d value(s)" - % (term, location, num_missing, len(values)) + f" [{term}|{location}] missing {int(num_missing)}/{len(values)} value(s)" ) except Exception as ex: print(f" [{term}|{location}] caught exception (will NOT retry):", ex) # keep track of how many rows were added rows_after = get_num_rows() - print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") # cleanup cur.close() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 69d751e95..86d8fc690 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -142,12 +142,12 @@ def main(): expected_weeks = result["num_weeks"] received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) if expected_weeks != received_weeks: - raise Exception("expected %d weeks, received %d" % (expected_weeks, received_weeks)) + raise Exception(f"expected {int(expected_weeks)} weeks, received {int(received_weeks)}") # results epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] for (epiweek, value) in zip(epiweeks, values): - print("%6d: %.3f" % (epiweek, value)) + print(f"{int(epiweek):6}: {value:.3f}") if __name__ == "__main__": diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index b2c12dba9..713b21f00 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -84,7 +84,7 @@ def safe_int(i): def get_rows(cnx, table="kcdc_ili"): # Count and return the number of rows in the `kcdc_ili` table. 
select = cnx.cursor() - select.execute("SELECT count(1) num FROM %s" % table) + select.execute(f"SELECT count(1) num FROM {table}") for (num,) in select: pass select.close() @@ -126,7 +126,7 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() sql = """ @@ -160,7 +160,7 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -173,7 +173,7 @@ def main(): args = parser.parse_args() date = datetime.datetime.now().strftime("%Y-%m-%d") - print("assuming release date is today, %s" % date) + print(f"assuming release date is today, {date}") issue = EpiDate.today().get_ew() ensure_tables_exist() diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 57f4e272d..b2e369e63 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -121,7 +121,7 @@ def _get_metadata(html): match = release_pattern.match(line) if match is not None: year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) - release = "%04d-%02d-%02d" % (year, month, day) + release = f"{int(year):04}-{int(month):02}-{int(day):02}" if issue is None or release is None: raise Exception("metadata not found") return issue, release @@ -173,7 +173,7 @@ def get_flu_data(): # Fetch the flu page response = requests.get(NIDSS.FLU_URL) if response.status_code != 200: - raise Exception("request failed [%d]" % response.status_code) + raise Exception(f"request failed [{int(response.status_code)}]") html = response.text # Parse metadata latest_week, release_date = NIDSS._get_metadata(html) @@ -199,7 +199,7 @@ def get_dengue_data(first_week, last_week): # Download CSV response = requests.get(NIDSS.DENGUE_URL) if response.status_code != 200: - raise Exception("export Dengue failed [%d]" % response.status_code) + raise Exception(f"export Dengue failed [{int(response.status_code)}]") csv = response.content.decode("big5-tw") # Parse the data lines = [l.strip() for l in csv.split("\n")[1:] if l.strip() != ""] @@ -231,7 +231,7 @@ def get_dengue_data(first_week, last_week): continue if epiweek not in data or location not in data[epiweek]: # Not a vaild U.S. 
epiweek - raise Exception("data missing %d-%s" % (epiweek, location)) + raise Exception(f"data missing {int(epiweek)}-{location}") # Add the counts to the location on this epiweek data[epiweek][location] += count # Return results indexed by week and location @@ -258,12 +258,12 @@ def main(): print("*** Flu ***") for region in sorted(list(fdata[ew].keys())): visits, ili = fdata[ew][region]["visits"], fdata[ew][region]["ili"] - print("region=%s | visits=%d | ili=%.3f" % (region, visits, ili)) + print(f"region={region} | visits={int(visits)} | ili={ili:.3f}") print("*** Dengue ***") for location in sorted(list(ddata[ew].keys())): region = NIDSS.LOCATION_TO_REGION[location] count = ddata[ew][location] - print("location=%s | region=%s | count=%d" % (location, region, count)) + print(f"location={location} | region={region} | count={int(count)}") if __name__ == "__main__": diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index c22f0dfaa..30d458481 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -107,8 +107,8 @@ def update(test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print("rows before (flu): %d" % (rows1[0])) - print("rows before (dengue): %d" % (rows1[1])) + print(f"rows before (flu): {int(rows1[0])}") + print(f"rows before (dengue): {int(rows1[1])}") insert = cnx.cursor() sql_flu = """ INSERT INTO @@ -149,8 +149,8 @@ def update(test_mode=False): # Cleanup insert.close() rows2 = get_rows(cnx) - print("rows after (flu): %d (added %d)" % (rows2[0], rows2[0] - rows1[0])) - print("rows after (dengue): %d (added %d)" % (rows2[1], rows2[1] - rows1[1])) + print(f"rows after (flu): {int(rows2[0])} (added {int(rows2[0] - rows1[0])})") + print(f"rows after (dengue): {int(rows2[1])} (added {int(rows2[1] - rows1[1])})") if test_mode: print("test mode: changes not commited") else: diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index 67fbc1d28..04e4dfe1a 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -110,7 +110,7 @@ def safe_int(i): def get_rows(cnx, table="paho_dengue"): # Count and return the number of rows in the `fluview` table. 
select = cnx.cursor() - select.execute("SELECT count(1) num FROM %s" % table) + select.execute(f"SELECT count(1) num FROM {table}") for (num,) in select: pass select.close() @@ -171,19 +171,19 @@ def update_from_file(issue, date, filename, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx, "paho_dengue") - print("rows before: %d" % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - print("loading data from %s as issued on %d" % (filename, issue)) + print(f"loading data from {filename} as issued on {int(issue)}") with open(filename, encoding="utf-8") as f: c = f.read() rows = [] for l in csv.reader(StringIO(c), delimiter=","): rows.append(get_paho_row(l)) - print(" loaded %d rows" % len(rows)) + print(f" loaded {len(rows)} rows") entries = [obj for obj in rows if obj] - print(" found %d entries" % len(entries)) + print(f" found {len(entries)} entries") sql = """ INSERT INTO @@ -227,7 +227,7 @@ def update_from_file(issue, date, filename, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -249,7 +249,7 @@ def main(): raise Exception("--file and --issue must both be present or absent") date = datetime.datetime.now().strftime("%Y-%m-%d") - print("assuming release date is today, %s" % date) + print(f"assuming release date is today, {date}") if args.file: update_from_file(args.issue, date, args.file, test_mode=args.test) diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index 5308ec93f..c6fa70285 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -23,15 +23,15 @@ def wait_for(browser, css_selector, delay=10): WebDriverWait(browser, delay).until( EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)) ) - print("Success Loading %s" % (css_selector)) + print(f"Success Loading {css_selector}") except TimeoutException: - print("Loading %s took too much time!" % (css_selector)) + print(f"Loading {css_selector} took too much time!") def find_and_click(browser, element): element.location_once_scrolled_into_view browser.switch_to.default_content() - browser.execute_script("window.scrollBy(0,-%d)" % headerheight) + browser.execute_script(f"window.scrollBy(0,-{int(headerheight)})") browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) element.click() @@ -130,9 +130,9 @@ def get_paho_data(offset=0, dir="downloads"): # print gp.is_displayed() try: WebDriverWait(browser, 10).until(EC.staleness_of(gp)) - print("Loaded next week % d" % (53 - offset)) + print(f"Loaded next week {int(53 - offset)}") except TimeoutException: - print("Loading next week %d took too much time!" 
% (53 - offset)) + print(f"Loading next week {int(53 - offset)} took too much time!") gp = browser.find_element_by_css_selector("div.wcGlassPane") # print gp.is_enabled() # print gp.is_selected() @@ -147,7 +147,7 @@ def get_paho_data(offset=0, dir="downloads"): for i in range(54 - offset): # If something goes wrong for whatever reason, try from the beginning try: - print("Loading week %d" % (53 - i)) + print(f"Loading week {int(53 - i)}") # (Re-)load URL browser.switch_to.window(tab2) browser.get(dataurl) @@ -182,7 +182,7 @@ def get_paho_data(offset=0, dir="downloads"): find_and_click(browser, x) curr_offset += 1 except Exception as e: - print("Got exception %s\nTrying again from week %d" % (e, 53 - offset)) + print(f"Got exception {e}\nTrying again from week {int(53 - offset)}") browser.quit() get_paho_data(offset=curr_offset) browser.quit() diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index 06f8b9da5..267200643 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -79,7 +79,7 @@ def get_num_rows(): ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) ew0 = ew0 if first is None else first ew1 = ew1 if last is None else last - print("Checking epiweeks between %d and %d..." % (ew0, ew1)) + print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...") # keep track of how many rows were added rows_before = get_num_rows() @@ -109,11 +109,11 @@ def get_num_rows(): if v == 0: num_missing += 1 if num_missing > 0: - print(" [%s] missing %d/%d value(s)" % (location, num_missing, len(ews))) + print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)") # keep track of how many rows were added rows_after = get_num_rows() - print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") # cleanup cur.close() diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index f64bbd689..13828af74 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -104,7 +104,7 @@ def __init__(self, username, password, debug=False): response = self._go("https://www.healthtweets.org/accounts/login") token = self._get_token(response.text) if self.debug: - print("token=%s" % (token)) + print(f"token={token}") data = { "csrfmiddlewaretoken": token, "username": username, @@ -145,13 +145,10 @@ def _get_values(self, state, date1, date2, normalized): d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d") s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y") count_type = "normalized" if normalized else "raw" - url = ( - "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" - % (count_type, (d2 - d1).days, s1, s2, state_code) - ) response = self._go( - "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" - % (count_type, (d2 - d1).days, s1, s2, state_code) + "https://www.healthtweets.org/trends/plot?resolution=Day" + f"&count_type={count_type}&dayNum={(d2 - d1).days}&from={s1}" + f"&to={s2}&plot1_disease=65&location_plot1={int(state_code)}" ) # print(state, date1, date2, normalized) # print(url) @@ -179,7 +176,9 @@ def check_state(self, state): raise Exception("invalid state") state_code = HealthTweets.STATE_CODES[state] response = self._go( - 
"https://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d" % (state_code) + "https://www.healthtweets.org/trends/plot?resolution=Day" + "&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015" + f"&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1={int(state_code)}" ) lines = [line.strip() for line in response.text.split("\n")] data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] @@ -198,7 +197,7 @@ def _get_token(self, html): def _go(self, url, method=None, referer=None, data=None): if self.debug: - print("%s" % (url)) + print(url) if method is None: if data is None: method = self.session.get @@ -208,8 +207,8 @@ def _go(self, url, method=None, referer=None, data=None): html = response.text if self.debug: for item in response.history: - print(" [%d to %s]" % (item.status_code, item.headers["Location"])) - print(" %d (%d bytes)" % (response.status_code, len(html))) + print(f" [{int(item.status_code)} to {item.headers['Location']}]") + print(f" {int(response.status_code)} ({len(html)} bytes)") return response diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 4354c5a80..80a023f19 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -102,7 +102,7 @@ def get_num_rows(): # keep track of how many rows were added rows_after = get_num_rows() - print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") # cleanup cur.close() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index c32fc87ed..8cb586c24 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -119,10 +119,10 @@ def extract_article_counts_orig(articles, debug_mode): counts = {} for article in articles: if debug_mode: - print(" %s" % (article)) + print(f" {article}") out = text( subprocess.check_output( - 'LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True + f'LC_ALL=C grep -a -i "^en {article.lower()} " raw2 | cat', shell=True ) ).strip() count = 0 @@ -130,13 +130,13 @@ def extract_article_counts_orig(articles, debug_mode): for line in out.split("\n"): fields = line.split() if len(fields) != 4: - print("unexpected article format: [%s]" % (line)) + print(f"unexpected article format: [{line}]") else: count += int(fields[2]) # print ' %4d %s'%(count, article) counts[article.lower()] = count if debug_mode: - print(" %d" % (count)) + print(f" {int(count)}") print("getting total count...") out = text( subprocess.check_output( @@ -154,7 +154,7 @@ def extract_article_counts_orig(articles, debug_mode): def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() - print("this is [%s]" % (worker)) + print(f"this is [{worker}]") if debug_mode: print("*** running in debug mode ***") @@ -166,7 +166,7 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d ): try: time_start = datetime.datetime.now() - req = urlopen(MASTER_URL + "?get=x&type=%s" % (job_type)) + req = urlopen(MASTER_URL + f"?get=x&type={job_type}") code = req.getcode() if code != 200: if code == 201: @@ -178,7 +178,7 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d 
print("nothing to do, exiting") return else: - raise Exception("server response code (get) was %d" % (code)) + raise Exception(f"server response code (get) was {int(code)}") # Make the code compatible with mac os system if platform == "darwin": job_content = text(req.readlines()[1]) @@ -193,19 +193,17 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d print("nothing to do, exiting") return job = json.loads(job_content) - print("received job [%d|%s]" % (job["id"], job["name"])) + print(f"received job [{int(job['id'])}|{job['name']}]") # updated parsing for pageviews - maybe use a regex in the future # year, month = int(job['name'][11:15]), int(job['name'][15:17]) year, month = int(job["name"][10:14]), int(job["name"][14:16]) # print 'year=%d | month=%d'%(year, month) - url = "https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s" % ( - year, - year, - month, - job["name"], + url = ( + "https://dumps.wikimedia.org/other/" + f"pageviews/{year}/{year}-{month:02d}/{job['name']}" ) - print("downloading file [%s]..." % (url)) - subprocess.check_call("curl -s %s > raw.gz" % (url), shell=True) + print(f"downloading file [{url}]...") + subprocess.check_call(f"curl -s {url} > raw.gz", shell=True) print("checking file size...") # Make the code cross-platfrom, so use python to get the size of the file # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) @@ -259,16 +257,16 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d payload = json.dumps(result) hmac_str = get_hmac_sha256(secret, payload) if debug_mode: - print(" hmac: %s" % hmac_str) + print(f" hmac: {hmac_str}") post_data = urlencode({"put": payload, "hmac": hmac_str}) req = urlopen(MASTER_URL, data=data(post_data)) code = req.getcode() if code != 200: - raise Exception("server response code (put) was %d" % (code)) - print("done! (dl=%d)" % (total_download)) + raise Exception(f"server response code (put) was {int(code)}") + print(f"done! (dl={int(total_download)})") passed_jobs += 1 except Exception as ex: - print("***** Caught Exception: %s *****" % (str(ex))) + print(f"***** Caught Exception: {str(ex)} *****") failed_jobs += 1 time.sleep(30) print( @@ -278,9 +276,9 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d time.sleep(sleep_time) if download_limit is not None and total_download >= download_limit: - print("download limit has been reached [%d >= %d]" % (total_download, download_limit)) + print(f"download limit has been reached [{int(total_download)} >= {int(download_limit)}]") if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: - print("job limit has been reached [%d >= %d]" % (passed_jobs + failed_jobs, job_limit)) + print(f"job limit has been reached [{int(passed_jobs + failed_jobs)} >= {int(job_limit)}]") def main(): diff --git a/src/acquisition/wiki/wiki_extract.py b/src/acquisition/wiki/wiki_extract.py index f4e0efb96..718a64c20 100644 --- a/src/acquisition/wiki/wiki_extract.py +++ b/src/acquisition/wiki/wiki_extract.py @@ -91,12 +91,12 @@ def run(job_limit=100): jobs = [] for (id, name, data_str) in cur: jobs.append((id, name, json.loads(data_str))) - print("Processing data from %d jobs" % (len(jobs))) + print(f"Processing data from {len(jobs)} jobs") # get the counts from the json object and insert into (or update) the database # Notice that data_collect contains data with different languages for (id, name, data_collect) in jobs: - print("processing job [%d|%s]..." 
% (id, name)) + print(f"processing job [{int(id)}|{name}]...") timestamp = round_timestamp(get_timestamp(name)) for language in data_collect.keys(): data = data_collect[language] diff --git a/src/acquisition/wiki/wiki_update.py b/src/acquisition/wiki/wiki_update.py index c9aa6d6a2..a9f240629 100644 --- a/src/acquisition/wiki/wiki_update.py +++ b/src/acquisition/wiki/wiki_update.py @@ -68,8 +68,8 @@ def get_timestamp(name): def get_manifest(year, month, optional=False): # unlike pagecounts-raw, pageviews doesn't provide hashes # url = 'https://dumps.wikimedia.org/other/pagecounts-raw/%d/%d-%02d/md5sums.txt'%(year, year, month) - url = "https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/" % (year, year, month) - print("Checking manifest at %s..." % (url)) + url = f"https://dumps.wikimedia.org/other/pageviews/{int(year)}/{int(year)}-{int(month):02}/" + print(f"Checking manifest at {url}...") response = requests.get(url) if response.status_code == 200: # manifest = [line.strip().split() for line in response.text.split('\n') if 'pagecounts' in line] @@ -82,8 +82,8 @@ def get_manifest(year, month, optional=False): if optional: manifest = [] else: - raise Exception("expected 200 status code, but got %d" % (response.status_code)) - print("Found %d access log(s)" % (len(manifest))) + raise Exception(f"expected 200 status code, but got {int(response.status_code)}") + print(f"Found {len(manifest)} access log(s)") return manifest @@ -98,7 +98,7 @@ def run(): cur.execute("SELECT max(`name`) FROM `wiki_raw`") for (max_name,) in cur: pass - print("Last known file: %s" % (max_name)) + print(f"Last known file: {max_name}") timestamp = get_timestamp(max_name) # crawl dumps.wikimedia.org to find more recent access logs @@ -113,7 +113,7 @@ def run(): if max_name is None or name > max_name: new_logs[name] = hash print(f" New job: {name} [{hash}]") - print("Found %d new job(s)" % (len(new_logs))) + print(f"Found {len(new_logs)} new job(s)") # store metadata for new jobs for name in sorted(new_logs.keys()): From 8459cc298b931f0a6ea9342c5ab225f24623b69f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 26 Jun 2023 09:58:26 -0700 Subject: [PATCH 37/43] gh: add .git-blame-ignore-revs --- .git-blame-ignore-revs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..97dc620be --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,24 @@ +# style(black): format cdc acquisition +980b0b7e80c7923b79e14fee620645e680785703 +# style(black): format covidcast_nowcast acquisition +9e6ff16f599e8feec34a08dd1bddbc5eae347b55 +# style(black): format ecdc acquisition +d1141d904da4e62992b97c92d5caebd8fadffd42 +# style(black): format flusurv acquisition +08af0f6b7bff85bbc2b193b63b5abf6a16ba03e4 +# style(black): format fluview acquisition +0133ef2042c4df8867e91595eb1f64873edb4632 +# style(black): format ght acquisition +b8900a0bc846888885310911efd6e26459effa99 +# style(black): format kcdc acquisition +a849384c884934b3b7c3c67b68aa6240277d6b6d +# style(black): format nidss acquisition +d04af3c02fda7708a16bec0952b1aa7475acaec7 +# style(black): format paho acquisition +7f60fbba572c1b6e5153a9ef216895bdc2f7f5b3 +# style(black): format quidel acquisition +b9ceb400d9248c8271e8342275664ac5524e335d +# style(black): format twitter acquisition +07ed83e5768f717ab0f9a62a9209e4e2cffa058d +# style(black): format wiki acquisition +923852eafa86b8f8b182d499489249ba8f815843 From 
f93f0208fe3d7de45fe72092ed88fadb203d1998 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 26 Jun 2023 10:42:57 -0700 Subject: [PATCH 38/43] style(acquisition): minor formatting fixes --- src/acquisition/ecdc/ecdc_db_update.py | 14 ++++-- src/acquisition/flusurv/flusurv_update.py | 11 ++++- src/acquisition/fluview/fluview_update.py | 14 ++++-- src/acquisition/ght/ght_update.py | 44 ++++++++++++++---- src/acquisition/ght/google_health_trends.py | 30 ++++++++++-- src/acquisition/paho/paho_db_update.py | 14 ++++-- src/acquisition/quidel/quidel.py | 2 +- src/acquisition/quidel/quidel_update.py | 38 +++++++++++++-- src/acquisition/twtr/healthtweets.py | 45 +++++++++++++++--- src/acquisition/wiki/wiki_download.py | 51 ++++++++++++++++++--- 10 files changed, 221 insertions(+), 42 deletions(-) diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 9a90dad5c..84423c376 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -156,15 +156,23 @@ def update_from_file(issue, date, dir, test_mode=False): def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - "--test", action="store_true", help="do dry run only, do not update the database" + "--test", + action="store_true", + help="do dry run only, do not update the database" ) parser.add_argument( - "--file", type=str, help="load an existing zip file (otherwise fetch current data)" + "--file", + type=str, + help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - "--issue", type=int, help="issue of the file (e.g. 201740); used iff --file is given" + "--issue", + type=int, + help="issue of the file (e.g. 201740); used iff --file is given" ) + # fmt: on args = parser.parse_args() if (args.file is None) != (args.issue is None): diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 3009c7a3d..1aa8e9885 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -160,12 +160,19 @@ def update(issue, location_name, test_mode=False): def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - "location", help='location for which data should be scraped (e.g. "CA" or "all")' + "location", + help='location for which data should be scraped (e.g. 
"CA" or "all")' ) parser.add_argument( - "--test", "-t", default=False, action="store_true", help="do not commit database changes" + "--test", + "-t", + default=False, + action="store_true", + help="do not commit database changes" ) + # fmt: on args = parser.parse_args() # scrape current issue from the main page diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 2c2551831..406725b8a 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -516,15 +516,23 @@ def update_from_file(issue, date, filename, test_mode=False): def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - "--test", action="store_true", help="do dry run only, do not update the database" + "--test", + action="store_true", + help="do dry run only, do not update the database" ) parser.add_argument( - "--file", type=str, help="load an existing zip file (otherwise fetch current data)" + "--file", + type=str, + help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - "--issue", type=int, help="issue of the file (e.g. 201740); used iff --file is given" + "--issue", + type=int, + help="issue of the file (e.g. 201740); used iff --file is given" ) + # fmt: on args = parser.parse_args() if (args.file is None) != (args.issue is None): diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index b7d5fd493..9e8d48d1d 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -330,9 +330,7 @@ def get_num_rows(): # print(' [%s|%s|%d] missing value' % (term, location, ew)) ew = flu.add_epiweeks(ew, 1) if num_missing > 0: - print( - f" [{term}|{location}] missing {int(num_missing)}/{len(values)} value(s)" - ) + print(f" [{term}|{location}] missing {int(num_missing)}/{len(values)} value(s)") except Exception as ex: print(f" [{term}|{location}] caught exception (will NOT retry):", ex) @@ -350,11 +348,41 @@ def main(): # args and usage parser = argparse.ArgumentParser() # fmt: off - parser.add_argument("location", action="store", type=str, default=None, help="location(s) (ex: all; US; TX; CA,LA,WY)") - parser.add_argument("term", action="store", type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') - parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") - parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") - parser.add_argument("--country", "-c", default="US", type=str, help="location country (ex: US; BR)") + parser.add_argument( + "location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; US; TX; CA,LA,WY)" + ) + parser.add_argument( + "term", + action="store", + type=str, + default=None, + help='term/query/topic (ex: all; /m/0cycc; "flu fever")' + ) + parser.add_argument( + "--first", + "-f", + default=None, + type=int, + help="first epiweek override" + ) + parser.add_argument( + "--last", + "-l", + default=None, + type=int, + help="last epiweek override" + ) + parser.add_argument( + "--country", + "-c", + default="US", + type=str, + help="location country (ex: US; BR)" + ) # fmt: on args = parser.parse_args() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 86d8fc690..4bb8df25f 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -116,19 +116,39 @@ def main(): 
parser = argparse.ArgumentParser() # fmt: off parser.add_argument( - "apikey", action="store", type=str, default=None, help="API key" + "apikey", + action="store", + type=str, + default=None, + help="API key" ) parser.add_argument( - "startweek", action="store", type=int, default=None, help="first week (ex: 201440)" + "startweek", + action="store", + type=int, + default=None, + help="first week (ex: 201440)" ) parser.add_argument( - "endweek", action="store", type=int, default=None, help="last week (ex: 201520)" + "endweek", + action="store", + type=int, + default=None, + help="last week (ex: 201520)" ) parser.add_argument( - "location", action="store", type=str, default=None, help="location (ex: US)" + "location", + action="store", + type=str, + default=None, + help="location (ex: US)" ) parser.add_argument( - "term", action="store", type=str, default=None, help="term/query/topic (ex: /m/0cycc)" + "term", + action="store", + type=str, + default=None, + help="term/query/topic (ex: /m/0cycc)" ) # fmt: on args = parser.parse_args() diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index 04e4dfe1a..b351d3ff2 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -234,15 +234,23 @@ def update_from_file(issue, date, filename, test_mode=False): def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - "--test", action="store_true", help="do dry run only, do not update the database" + "--test", + action="store_true", + help="do dry run only, do not update the database" ) parser.add_argument( - "--file", type=str, help="load an existing zip file (otherwise fetch current data)" + "--file", + type=str, + help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - "--issue", type=int, help="issue of the file (e.g. 201740); used iff --file is given" + "--issue", + type=int, + help="issue of the file (e.g. 
201740); used iff --file is given" ) + # fmt: on args = parser.parse_args() if (args.file is None) != (args.issue is None): diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index 3af99774f..0540d5e7c 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -140,7 +140,7 @@ def retrieve_excels(self): m.select("INBOX") # here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes # you could filter using the IMAP rules here (check https://www.example-code.com/csharp/imap-search-critera.asp) - _, items = m.search(None, "ALL") + _, items = m.search(None, "ALL") items = items[0].split() # getting the mails id # The emailids are ordered from past to now diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index 267200643..563cea898 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -125,11 +125,39 @@ def main(): # args and usage parser = argparse.ArgumentParser() # fmt: off - parser.add_argument("--location", action="store", type=str, default=None, help="location(s) (ex: all; any of hhs1-10)") - parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") - parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") - parser.add_argument("--force_update", "-u", action="store_true", help="force update db values") - parser.add_argument("--skip_email", "-s", action="store_true", help="skip email downloading step") + parser.add_argument( + "--location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; any of hhs1-10)" + ) + parser.add_argument( + "--first", + "-f", + default=None, + type=int, + help="first epiweek override" + ) + parser.add_argument( + "--last", + "-l", + default=None, + type=int, + help="last epiweek override" + ) + parser.add_argument( + "--force_update", + "-u", + action="store_true", + help="force update db values" + ) + parser.add_argument( + "--skip_email", + "-s", + action="store_true", + help="skip email downloading step" + ) # fmt: on args = parser.parse_args() diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 13828af74..c1e345162 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -216,12 +216,45 @@ def main(): # args and usage parser = argparse.ArgumentParser() # fmt: off - parser.add_argument("username", action="store", type=str, help="healthtweets.org username") - parser.add_argument("password", action="store", type=str, help="healthtweets.org password") - parser.add_argument("state", action="store", type=str, choices=list(HealthTweets.STATE_CODES.keys()), help="U.S. state (ex: TX)") - parser.add_argument("date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)") - parser.add_argument("date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)") - parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode") + parser.add_argument( + "username", + action="store", + type=str, + help="healthtweets.org username" + ) + parser.add_argument( + "password", + action="store", + type=str, + help="healthtweets.org password" + ) + parser.add_argument( + "state", + action="store", + type=str, + choices=list(HealthTweets.STATE_CODES.keys()), + help="U.S. 
state (ex: TX)" + ) + parser.add_argument( + "date1", + action="store", + type=str, + help="first date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "date2", + action="store", + type=str, + help="last date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + default=False, + help="enable debug mode" + ) # fmt: on args = parser.parse_args() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 8cb586c24..6192eab02 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -288,12 +288,51 @@ def main(): # args and usage parser = argparse.ArgumentParser() # fmt: off - parser.add_argument("secret", type=str, help="hmac secret key") - parser.add_argument("-b", "--blimit", action="store", type=int, default=None, help="download limit, in bytes") - parser.add_argument("-j", "--jlimit", action="store", type=int, default=None, help="job limit") - parser.add_argument("-s", "--sleep", action="store", type=int, default=1, help="seconds to sleep between each job") - parser.add_argument("-t", "--type", action="store", type=int, default=0, help="type of job") - parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode") + parser.add_argument( + "secret", + type=str, + help="hmac secret key" + ) + parser.add_argument( + "-b", + "--blimit", + action="store", + type=int, + default=None, + help="download limit, in bytes" + ) + parser.add_argument( + "-j", + "--jlimit", + action="store", + type=int, + default=None, + help="job limit" + ) + parser.add_argument( + "-s", + "--sleep", + action="store", + type=int, + default=1, + help="seconds to sleep between each job" + ) + parser.add_argument( + "-t", + "--type", + action="store", + type=int, + default=0, + help="type of job" + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + default=False, + help="enable debug mode" + ) # fmt: on args = parser.parse_args() From 27ea8810e59fa37484b8411f4d7a1ce9643404da Mon Sep 17 00:00:00 2001 From: Dmytro Trotsko Date: Mon, 26 Jun 2023 23:45:30 +0300 Subject: [PATCH 39/43] Resolved conflicts --- src/acquisition/fluview/fluview_notify.py | 2 +- src/acquisition/fluview/fluview_update.py | 10 +++++----- src/acquisition/fluview/impute_missing_values.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/acquisition/fluview/fluview_notify.py b/src/acquisition/fluview/fluview_notify.py index a280889a5..3ed1a243f 100644 --- a/src/acquisition/fluview/fluview_notify.py +++ b/src/acquisition/fluview/fluview_notify.py @@ -46,7 +46,7 @@ # connect u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) cur = cnx.cursor() # get the last known issue from the automation table `variables` diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 406725b8a..defd01dad 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -311,7 +311,7 @@ def update_from_file_clinical(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) rows1 = get_rows(cnx, CL_TABLE) print(f"rows 
before: {int(rows1)}") insert = cnx.cursor() @@ -364,7 +364,7 @@ def update_from_file_clinical(issue, date, filename, test_mode=False): rows2 = rows1 else: cnx.commit() - rows2 = get_rows(cnx) + rows2 = get_rows(cnx, CL_TABLE) print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -376,7 +376,7 @@ def update_from_file_public(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) rows1 = get_rows(cnx, PHL_TABLE) print(f"rows before: {int(rows1)}") insert = cnx.cursor() @@ -433,7 +433,7 @@ def update_from_file_public(issue, date, filename, test_mode=False): rows2 = rows1 else: cnx.commit() - rows2 = get_rows(cnx) + rows2 = get_rows(cnx, PHL_TABLE) print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() @@ -445,7 +445,7 @@ def update_from_file(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) rows1 = get_rows(cnx) print(f"rows before: {int(rows1)}") insert = cnx.cursor() diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 4b3e1d684..c795d9cce 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -135,7 +135,7 @@ class Sql: def connect(self): """Connect to the database.""" u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database="epidata") + self.cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) self.cur = self.cnx.cursor() def close(self, commit): From dc4d74a933482386a8f1e773e118386cef8434c8 Mon Sep 17 00:00:00 2001 From: melange396 Date: Mon, 26 Jun 2023 18:08:02 -0400 Subject: [PATCH 40/43] re-enable tracking of last time an api key was used (#1213) --- src/server/_security.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/server/_security.py b/src/server/_security.py index 61e2608b2..b40ac445e 100644 --- a/src/server/_security.py +++ b/src/server/_security.py @@ -121,9 +121,8 @@ def decorated_function(*args, **kwargs): def update_key_last_time_used(user): - # TODO: reenable this once cc<-->aws latency issues are sorted out, or maybe do this call asynchronously - return if user: # update last usage for this user's api key to "now()" + # TODO: consider making this call asynchronously r = redis.Redis(host=REDIS_HOST, password=REDIS_PASSWORD) r.set(f"LAST_USED/{user.api_key}", datetime.strftime(datetime.now(), "%Y-%m-%d")) From 5657dee1a2ba4c90db80896b230e08f3d091c491 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 22:47:04 -0400 Subject: [PATCH 41/43] Bump requests from 2.28.1 to 2.31.0 (#1173) Bumps [requests](https://github.com/psf/requests) from 2.28.1 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.28.1...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.api.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.api.txt b/requirements.api.txt index c7de90997..e9c1418df 100644 --- a/requirements.api.txt +++ b/requirements.api.txt @@ -12,7 +12,7 @@ pandas==1.2.3 python-dotenv==0.15.0 pyyaml redis==3.5.3 -requests==2.28.1 +requests==2.31.0 scipy==1.6.2 SQLAlchemy==1.4.40 structlog==22.1.0 From e0a09402316321f6aa7dd90034ada2fb7e904563 Mon Sep 17 00:00:00 2001 From: Dmytro Trotsko Date: Tue, 27 Jun 2023 19:01:51 +0300 Subject: [PATCH 42/43] Added new constraint fluview_clinical. Added migration file to remove duplicates from fluview_clinical --- src/ddl/fluview.sql | 2 +- src/ddl/migrations/fluview_clinical_v0.1.sql | 115 +++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 src/ddl/migrations/fluview_clinical_v0.1.sql diff --git a/src/ddl/fluview.sql b/src/ddl/fluview.sql index 9da1589ce..11f10c9dc 100644 --- a/src/ddl/fluview.sql +++ b/src/ddl/fluview.sql @@ -269,8 +269,8 @@ CREATE TABLE `fluview_clinical` ( `percent_a` double DEFAULT NULL, `percent_b` double DEFAULT NULL, PRIMARY KEY (`id`), + UNIQUE KEY `issue` (`issue`, `epiweek`, `region`), KEY `release_date` (`release_date`), - KEY `issue` (`issue`), KEY `epiweek` (`epiweek`), KEY `region` (`region`), KEY `lag` (`lag`) diff --git a/src/ddl/migrations/fluview_clinical_v0.1.sql b/src/ddl/migrations/fluview_clinical_v0.1.sql new file mode 100644 index 000000000..0b8aa5855 --- /dev/null +++ b/src/ddl/migrations/fluview_clinical_v0.1.sql @@ -0,0 +1,115 @@ +USE epidata; + +-- Create new `fluview_clinical` table with proper unique constraint. +CREATE TABLE `fluview_clinical_v2` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `release_date` date NOT NULL, + `issue` int(11) NOT NULL, + `epiweek` int(11) NOT NULL, + `region` varchar(12) NOT NULL, + `lag` int(11) NOT NULL, + `total_specimens` int(11) NOT NULL, + `total_a` int(11) DEFAULT NULL, + `total_b` int(11) DEFAULT NULL, + `percent_positive` double DEFAULT NULL, + `percent_a` double DEFAULT NULL, + `percent_b` double DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `issue` (`issue`, `epiweek`, `region`), + KEY `release_date` (`release_date`), + KEY `epiweek` (`epiweek`), + KEY `region` (`region`), + KEY `lag` (`lag`) +) ENGINE = InnoDB DEFAULT CHARSET = utf8; + + +-- Insert unique rows from `fluview_clinical` into `fluview_clinical_v2`. +-- This is done in order to reset ID counter and fill gaps betwen row's ids. 
+INSERT INTO + fluview_clinical_v2( + `release_date`, + `issue`, + `epiweek`, + `region`, + `lag`, + `total_specimens`, + `total_a`, + `total_b`, + `percent_positive`, + `percent_a`, + `percent_b` + ) +SELECT + min_release_date release_date, + tmp.issue, + tmp.epiweek, + tmp.region, + tmp.lag, + tmp.total_specimens, + tmp.total_a, + tmp.total_b, + tmp.percent_positive, + tmp.percent_a, + tmp.percent_b +FROM + ( + -- get data associated with the most recent `release_date` for each unique `(epiweek, issue, region)` key + SELECT + s.release_date, + s.issue, + s.epiweek, + s.region, + s.lag, + s.total_specimens, + s.total_a, + s.total_b, + s.percent_positive, + s.percent_a, + s.percent_b + FROM + ( + SELECT + fc.release_date, + fc.issue, + fc.epiweek, + fc.region, + fc.lag, + fc.total_specimens, + fc.total_a, + fc.total_b, + fc.percent_positive, + fc.percent_a, + fc.percent_b, + ROW_NUMBER() OVER( + PARTITION BY fc.epiweek, + fc.issue, + fc.region + ORDER BY + fc.release_date DESC + ) as row_num + FROM + fluview_clinical fc + ) s + WHERE + s.row_num = 1 + ) tmp + JOIN ( + -- JOIN to recover first/least `release_date` because thats what the acquisition process does: https://github.com/cmu-delphi/delphi-epidata/blob/7fd20cd5c34b33c2310be67867b46a91aa840be9/src/acquisition/fluview/fluview_update.py#L326 + SELECT + MIN(fc.release_date) as min_release_date, + fc.issue, + fc.epiweek, + fc.region + FROM + fluview_clinical fc + GROUP BY + fc.issue, + fc.epiweek, + fc.region + ) rel_date ON tmp.issue = rel_date.issue + AND tmp.epiweek = rel_date.epiweek + AND tmp.region = rel_date.region; + +DROP TABLE fluview_clinical; + +ALTER TABLE fluview_clinical_v2 RENAME fluview_clinical; \ No newline at end of file From 7cfc97d85b01c4729c09554ef9fc2ad3af147c1b Mon Sep 17 00:00:00 2001 From: nolangormley Date: Tue, 27 Jun 2023 18:16:12 +0000 Subject: [PATCH 43/43] chore: release delphi-epidata 4.1.4 --- .bumpversion.cfg | 2 +- dev/local/setup.cfg | 2 +- src/client/delphi_epidata.R | 2 +- src/client/delphi_epidata.js | 2 +- src/client/packaging/npm/package.json | 2 +- src/client/packaging/pypi/delphi_epidata/__init__.py | 2 +- src/client/packaging/pypi/setup.py | 2 +- src/server/_config.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 99c2373b6..358c9029d 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.3 +current_version = 4.1.4 commit = False tag = False diff --git a/dev/local/setup.cfg b/dev/local/setup.cfg index 69bc91778..1b5529b22 100644 --- a/dev/local/setup.cfg +++ b/dev/local/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = Delphi Development -version = 4.1.3 +version = 4.1.4 [options] packages = diff --git a/src/client/delphi_epidata.R b/src/client/delphi_epidata.R index 627948cc2..99201abd8 100644 --- a/src/client/delphi_epidata.R +++ b/src/client/delphi_epidata.R @@ -15,7 +15,7 @@ Epidata <- (function() { # API base url BASE_URL <- getOption('epidata.url', default = 'https://api.delphi.cmu.edu/epidata/') - client_version <- '4.1.3' + client_version <- '4.1.4' auth <- getOption("epidata.auth", default = NA) diff --git a/src/client/delphi_epidata.js b/src/client/delphi_epidata.js index 117fe8949..1987595d7 100644 --- a/src/client/delphi_epidata.js +++ b/src/client/delphi_epidata.js @@ -22,7 +22,7 @@ } })(this, function (exports, fetchImpl, jQuery) { const BASE_URL = "https://api.delphi.cmu.edu/epidata/"; - const client_version = "4.1.3"; + const client_version = "4.1.4"; // 
Helper function to cast values and/or ranges to strings function _listitem(value) { diff --git a/src/client/packaging/npm/package.json b/src/client/packaging/npm/package.json index 40c3d53a6..492d63760 100644 --- a/src/client/packaging/npm/package.json +++ b/src/client/packaging/npm/package.json @@ -2,7 +2,7 @@ "name": "delphi_epidata", "description": "Delphi Epidata API Client", "authors": "Delphi Group", - "version": "4.1.3", + "version": "4.1.4", "license": "MIT", "homepage": "https://github.com/cmu-delphi/delphi-epidata", "bugs": { diff --git a/src/client/packaging/pypi/delphi_epidata/__init__.py b/src/client/packaging/pypi/delphi_epidata/__init__.py index c8f19f67c..1e280b80c 100644 --- a/src/client/packaging/pypi/delphi_epidata/__init__.py +++ b/src/client/packaging/pypi/delphi_epidata/__init__.py @@ -1,4 +1,4 @@ from .delphi_epidata import Epidata name = 'delphi_epidata' -__version__ = '4.1.3' +__version__ = '4.1.4' diff --git a/src/client/packaging/pypi/setup.py b/src/client/packaging/pypi/setup.py index e57e565b6..5c36175b0 100644 --- a/src/client/packaging/pypi/setup.py +++ b/src/client/packaging/pypi/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="delphi_epidata", - version="4.1.3", + version="4.1.4", author="David Farrow", author_email="dfarrow0@gmail.com", description="A programmatic interface to Delphi's Epidata API.", diff --git a/src/server/_config.py b/src/server/_config.py index 168512a3d..0fa9d55e3 100644 --- a/src/server/_config.py +++ b/src/server/_config.py @@ -7,7 +7,7 @@ load_dotenv() -VERSION = "4.1.3" +VERSION = "4.1.4" MAX_RESULTS = int(10e6) MAX_COMPATIBILITY_RESULTS = int(3650)