From 93c259e7808b20a13b7511f145d12eab8355dcec Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 13:19:11 +0200 Subject: [PATCH 01/37] src\maintenance\covidcast_meta_cache_updater.py superfluous-parens Removed unneeded parentheses --- src/maintenance/covidcast_meta_cache_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/maintenance/covidcast_meta_cache_updater.py b/src/maintenance/covidcast_meta_cache_updater.py index cb0b2703f..ebb7c8360 100644 --- a/src/maintenance/covidcast_meta_cache_updater.py +++ b/src/maintenance/covidcast_meta_cache_updater.py @@ -26,7 +26,7 @@ def main(args, epidata_impl=Epidata, database_impl=Database): """ log_file = None num_threads = None - if (args): + if args: log_file = args.log_file num_threads = args.num_threads From 8282f0b969059fd2014f9afbd7b6309edfbe89f0 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 13:29:44 +0200 Subject: [PATCH 02/37] src\server\_query.py comparison-of-constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code was: def dummy_gen(): if 3 > 4: yield {} Clearly, dummy_gen is not intended to reach the yield. To clarify that, I changed the condition to False. I still wonder why the function is needed. --- src/server/_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/_query.py b/src/server/_query.py index 267a78eb1..ba799e9f2 100644 --- a/src/server/_query.py +++ b/src/server/_query.py @@ -280,7 +280,7 @@ def execute_queries( query_list = list(queries) def dummy_gen(): - if 3 > 4: + if False: yield {} if not query_list or p.remaining_rows <= 0: From 24c0557fe968d3457cfb11701539acf890d98af0 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 15:28:34 +0200 Subject: [PATCH 03/37] src\acquisition\quidel\quidel.py too-many-branches Method prepare_measurements of class QuidelData had 19 branches while Pylint recommends having at most 12. I extracted the methods _prepare_device_id and _prepare_all_measurments to make the code more structured and solve that.
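For reference, the extract-method pattern applied in this patch, reduced to a minimal sketch (the names and data below are illustrative, not taken from quidel.py): each pass of the original loop becomes its own helper, so no single function carries enough if/for statements to exceed Pylint's branch limit.

    # Minimal sketch of the extract-method refactoring (illustrative names only).
    def prepare(records):
        per_week = _group_by_week(records)   # first pass
        return _accumulate(per_week)         # second pass

    def _group_by_week(records):
        groups = {}
        for week, value in records:
            groups.setdefault(week, []).append(value)
        return groups

    def _accumulate(groups):
        return {week: sum(values) for week, values in groups.items()}
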
--- src/acquisition/quidel/quidel.py | 37 +++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index 0540d5e7c..c2f8f98fc 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -242,7 +242,7 @@ def load_csv(self, dims=None): # hardcoded aggregation function # output: [#unique_device,fluA,fluB,fluAll,total] def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6): - buffer_dict = {} + if use_hhs: region_list = Locations.hhs_list else: @@ -265,6 +265,29 @@ def get_hhs_region(atom): end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included end_epiweek = date_to_epiweek(end_date, shift=-4) + # first pass: prepare device_id set + device_dict = self._prepare_device_id(data_dict + , end_epiweek + , region_list + , time_map + , region_map) + + buffer_dict = self._prepare_all_measurments(data_dict + , device_dict + , end_epiweek + , time_map + , region_list + , region_map) + # switch two dims of dict + result_dict = {} + for r in region_list: + result_dict[r] = {} + for (k, v) in buffer_dict.items(): + result_dict[r][k] = v[r] + + return result_dict + + def _prepare_device_id(self, data_dict, end_epiweek, region_list, time_map, region_map): # first pass: prepare device_id set device_dict = {} for (date, daily_dict) in data_dict.items(): @@ -286,6 +309,10 @@ def get_hhs_region(atom): fac = rec[0] device_dict[ew][region].add(fac) + return device_dict + + def _prepare_all_measurments(self, data_dict, device_dict, end_epiweek, time_map, region_list, region_map): + buffer_dict = {} # second pass: prepare all measurements for (date, daily_dict) in data_dict.items(): ew = time_map(date) @@ -316,11 +343,5 @@ def get_hhs_region(atom): 1.0 / fac_num, ], ).tolist() - # switch two dims of dict - result_dict = {} - for r in region_list: - result_dict[r] = {} - for (k, v) in buffer_dict.items(): - result_dict[r][k] = v[r] - return result_dict + return buffer_dict From bc23b26a5b87fe129b8db3e5902d634df460e290 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 16:50:25 +0200 Subject: [PATCH 04/37] src\acquisition\cdcp\cdc_upload.py too-many-statements Function upload had 68 statements while Pylint recommends having at most 50. I added the cursor as a parameter to its external functions and moved them out. 
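The pattern behind this change, as a hedged sketch with made-up table and column names: a nested helper that silently captures the cursor from the enclosing scope becomes a module-level function that receives the cursor explicitly, which shrinks the outer function's statement count and makes the helper testable on its own.

    # Before: the helper closes over `cur`, so it must live inside upload() (illustrative).
    def upload_before(cnx):
        cur = cnx.cursor()
        def insert_row(date, num):
            cur.execute("INSERT INTO example (date, num) VALUES (%s, %s)", (date, num))
        insert_row("2024-12-14", 1)

    # After: `cur` is an explicit parameter, so the helper can move to module level.
    def insert_row(cur, date, num):
        cur.execute("INSERT INTO example (date, num) VALUES (%s, %s)", (date, num))

    def upload_after(cnx):
        cur = cnx.cursor()
        insert_row(cur, "2024-12-14", 1)
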
--- src/acquisition/cdcp/cdc_upload.py | 137 +++++++++++++++-------------- 1 file changed, 69 insertions(+), 68 deletions(-) diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index 0e191267b..4576edaed 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -161,6 +161,74 @@ `total` = %s """ +# insert (or update) table `cdc` +def insert_cdc(cur, date, page, state, num): + cur.execute(sql_cdc, (date, page, state, num, num)) + +# insert (or update) table `cdc_meta` +def insert_cdc_meta(cur, date, state, total): + cur.execute(sql_cdc_meta, (date, date, state, total, total)) + +# loop over rows until the header row is found +def find_header(reader): + for row in reader: + if len(row) > 0 and row[0] == "Date": + return True + return False + +# parse csv files for `cdc` and `cdc_meta` +def parse_csv(cur, meta): + def handler(cur, reader): + if not find_header(reader): + raise Exception("header not found") + count = 0 + cols = 3 if meta else 4 + for row in reader: + if len(row) != cols: + continue + if meta: + (a, c, d) = row + else: + (a, b, c, d) = row + c = c[:-16] + if c not in STATES: + continue + a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d") + c = STATES[c] + d = int(d) + if meta: + insert_cdc_meta(cur, a, c, d) + else: + insert_cdc(cur, a, b, c, d) + count += 1 + return count + + return handler + + +# recursively open zip files +def parse_zip(cur, zf, level=1): + for name in zf.namelist(): + prefix = " " * level + print(prefix, name) + if name[-4:] == ".zip": + with zf.open(name) as temp: + with ZipFile(io.BytesIO(temp.read())) as zf2: + parse_zip(cur, zf2, level + 1) + elif name[-4:] == ".csv": + handler = None + if "Flu Pages by Region" in name: + handler = parse_csv(cur, False) + elif "Regions for all CDC" in name: + handler = parse_csv(cur, True) + else: + print(prefix, " (skipped)") + if handler is not None: + with zf.open(name) as temp: + count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) + print(prefix, f" {int(count)} rows") + else: + print(prefix, " (ignored)") def upload(test_mode): # connect @@ -168,73 +236,6 @@ def upload(test_mode): cnx = mysql.connector.connect(user=u, password=p, database="epidata") cur = cnx.cursor() - # insert (or update) table `cdc` - def insert_cdc(date, page, state, num): - cur.execute(sql_cdc, (date, page, state, num, num)) - - # insert (or update) table `cdc_meta` - def insert_cdc_meta(date, state, total): - cur.execute(sql_cdc_meta, (date, date, state, total, total)) - - # loop over rows until the header row is found - def find_header(reader): - for row in reader: - if len(row) > 0 and row[0] == "Date": - return True - return False - - # parse csv files for `cdc` and `cdc_meta` - def parse_csv(meta): - def handler(reader): - if not find_header(reader): - raise Exception("header not found") - count = 0 - cols = 3 if meta else 4 - for row in reader: - if len(row) != cols: - continue - if meta: - (a, c, d) = row - else: - (a, b, c, d) = row - c = c[:-16] - if c not in STATES: - continue - a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d") - c = STATES[c] - d = int(d) - if meta: - insert_cdc_meta(a, c, d) - else: - insert_cdc(a, b, c, d) - count += 1 - return count - - return handler - - # recursively open zip files - def parse_zip(zf, level=1): - for name in zf.namelist(): - prefix = " " * level - print(prefix, name) - if name[-4:] == ".zip": - with zf.open(name) as temp: - with ZipFile(io.BytesIO(temp.read())) as zf2: - parse_zip(zf2, level + 1) 
- elif name[-4:] == ".csv": - handler = None - if "Flu Pages by Region" in name: - handler = parse_csv(False) - elif "Regions for all CDC" in name: - handler = parse_csv(True) - else: - print(prefix, " (skipped)") - if handler is not None: - with zf.open(name) as temp: - count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) - print(prefix, f" {int(count)} rows") - else: - print(prefix, " (ignored)") # find, parse, and move zip files zip_files = glob.glob("/common/cdc_stage/*.zip") @@ -244,7 +245,7 @@ def parse_zip(zf, level=1): print("parsing...") for f in zip_files: with ZipFile(f) as zf: - parse_zip(zf) + parse_zip(cur, zf) print("moving...") for f in zip_files: src = f From f217d2fb133615c4b1958d83f9779e7d2774bc2f Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 16:51:38 +0200 Subject: [PATCH 05/37] src\server\_printer.py too-many-return-statements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function create_printer had 7 returns while Pylint recommends having at most 6. The code checks for a list of options. I removed some into a dictionary. I also assigned the results into a variable and used a single return at the end of the function. --- src/server/_printer.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/server/_printer.py b/src/server/_printer.py index 6df6d62b9..b475e6326 100644 --- a/src/server/_printer.py +++ b/src/server/_printer.py @@ -257,17 +257,18 @@ def _end(self): def create_printer(format: str) -> APrinter: + + format_dict = {"tree": ClassicTreePrinter("signal") + , "json": JSONPrinter() + , "csv": CSVPrinter() + , "jsonl": JSONLPrinter()} + if format is None: - return ClassicPrinter() - if format == "tree": - return ClassicTreePrinter("signal") - if format.startswith("tree-"): + printer = ClassicPrinter() + elif format.startswith("tree-"): # support tree format by any property following the dash - return ClassicTreePrinter(format[len("tree-") :]) - if format == "json": - return JSONPrinter() - if format == "csv": - return CSVPrinter() - if format == "jsonl": - return JSONLPrinter() - return ClassicPrinter() \ No newline at end of file + printer = ClassicTreePrinter(format[len("tree-") :]) + else: + printer = format_dict.get(format, ClassicPrinter()) + + return printer \ No newline at end of file From f20aeeaf5ec2dac127390066ed59479fa9807892 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 17:51:12 +0200 Subject: [PATCH 06/37] src\acquisition\covidcast\csv_importer.py too-many-return-statements Static method extract_and_check_row of class CsvImporter had 14 returns while Pylint recommends having at most 6. I extracted the functions _validate_geo_type and _validate_geo_id to make the code more structured and solve that. --- src/acquisition/covidcast/csv_importer.py | 122 +++++++++++++--------- 1 file changed, 73 insertions(+), 49 deletions(-) diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index e9893c0da..be674e369 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -49,6 +49,69 @@ class CsvRowValue: missing_sample_size: int + +def _validate_geo_id(row, geo_type): + # use consistent capitalization (e.g. 
for states) + try: + geo_id = row.geo_id.lower() + except AttributeError: + # geo_id was `None` + return (None, 'geo_id') + + if geo_type in ('hrr', 'msa', 'dma', 'hhs'): + # these particular ids are prone to be written as ints -- and floats + try: + geo_id = str(CsvImporter.floaty_int(geo_id)) + except ValueError: + # expected a number, but got a string + return (None, 'geo_id') + + return geo_id + + +def _validate_geo_type(geo_type, geo_id): + + result = None + + # sanity check geo_id with respect to geo_type + if geo_type == 'county': + if len(geo_id) != 5 or not '01000' <= geo_id <= '80000': + result = (None, 'geo_id') + + elif geo_type == 'hrr': + if not 1 <= int(geo_id) <= 500: + result = (None, 'geo_id') + + elif geo_type == 'msa': + if len(geo_id) != 5 or not '10000' <= geo_id <= '99999': + result = (None, 'geo_id') + + elif geo_type == 'dma': + if not 450 <= int(geo_id) <= 950: + result = (None, 'geo_id') + + elif geo_type == 'state': + # note that geo_id is lowercase + if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': + result = (None, 'geo_id') + + elif geo_type == 'hhs': + if not 1 <= int(geo_id) <= 10: + result = (None, 'geo_id') + + elif geo_type == 'nation': + # geo_id is lowercase + if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': + result = (None, 'geo_id') + + else: + result = (None, 'geo_type') + + return result + + + + class CsvImporter: """Finds and parses covidcast CSV files.""" @@ -283,6 +346,7 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N return missing_entry + @staticmethod def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]: """Extract and return `CsvRowValue` from a CSV row, with sanity checks. @@ -293,55 +357,15 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s geo_type: the geographic resolution of the file """ - # use consistent capitalization (e.g. 
for states) - try: - geo_id = row.geo_id.lower() - except AttributeError: - # geo_id was `None` - return (None, 'geo_id') - - if geo_type in ('hrr', 'msa', 'dma', 'hhs'): - # these particular ids are prone to be written as ints -- and floats - try: - geo_id = str(CsvImporter.floaty_int(geo_id)) - except ValueError: - # expected a number, but got a string - return (None, 'geo_id') - + geo_id = _validate_geo_id(row, geo_type) + if geo_id == (None, 'geo_id'): + return geo_id + # sanity check geo_id with respect to geo_type - if geo_type == 'county': - if len(geo_id) != 5 or not '01000' <= geo_id <= '80000': - return (None, 'geo_id') - - elif geo_type == 'hrr': - if not 1 <= int(geo_id) <= 500: - return (None, 'geo_id') - - elif geo_type == 'msa': - if len(geo_id) != 5 or not '10000' <= geo_id <= '99999': - return (None, 'geo_id') - - elif geo_type == 'dma': - if not 450 <= int(geo_id) <= 950: - return (None, 'geo_id') - - elif geo_type == 'state': - # note that geo_id is lowercase - if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': - return (None, 'geo_id') - - elif geo_type == 'hhs': - if not 1 <= int(geo_id) <= 10: - return (None, 'geo_id') - - elif geo_type == 'nation': - # geo_id is lowercase - if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': - return (None, 'geo_id') - - else: - return (None, 'geo_type') - + invalid = _validate_geo_type(geo_type, geo_id) + if invalid: + return invalid + # Validate row values value = CsvImporter.validate_quantity(row, "value") # value was a string or another dtype @@ -364,7 +388,6 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s # return extracted and validated row values return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None) - @staticmethod def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]: """Load, validate, and yield data as `RowValues` from a CSV file. @@ -414,3 +437,4 @@ def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[Covidcast details.issue, details.lag, ) + From d2c414726abd68008608d59a87f76bfc36bb3bf2 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 21:36:43 +0200 Subject: [PATCH 07/37] src\acquisition\covid_hosp\common\database.py too-many-branches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Method insert_dataset of class Database had 13 branches while Pylint recommends having at most 12. I extracted the method _process_rows   to make the code more structured and solve that. 
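The extracted helper keeps the existing batching idiom: rows are buffered and flushed with executemany every 5,000 rows, so inserts are neither one-at-a-time (slow) nor all-at-once (drops the connection). Stripped to its core, and with a hypothetical table, the idiom is:

    # Chunked executemany: buffer rows, flush each full batch, then flush the remainder.
    def insert_in_batches(cursor, rows, batch_size=5_000):
        sql = "INSERT INTO example (a, b) VALUES (%s, %s)"   # hypothetical table
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) == batch_size:
                cursor.executemany(sql, batch)
                batch = []
        if batch:
            cursor.executemany(sql, batch)
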
--- src/acquisition/covid_hosp/common/database.py | 73 +++++++++++-------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/src/acquisition/covid_hosp/common/database.py b/src/acquisition/covid_hosp/common/database.py index 18c7f377f..563bd7a58 100644 --- a/src/acquisition/covid_hosp/common/database.py +++ b/src/acquisition/covid_hosp/common/database.py @@ -192,37 +192,14 @@ def nan_safe_dtype(dtype, value): num_values = len(dataframe.index) if logger: logger.info('updating values', count=num_values) - n = 0 - rows_affected = 0 - many_values = [] - with self.new_cursor() as cursor: - for index, row in dataframe.iterrows(): - values = [] - for c in dataframe_columns_and_types: - values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) - many_values.append(id_and_publication_date + - tuple(values) + - tuple(i.csv_name for i in self.additional_fields)) - n += 1 - # insert in batches because one at a time is slow and all at once makes - # the connection drop :( - if n % 5_000 == 0: - try: - cursor.executemany(sql, many_values) - rows_affected += cursor.rowcount - many_values = [] - except Exception as e: - if logger: - logger.error('error on insert', publ_date=publication_date, in_lines=(n-5_000, n), index=index, values=values, exception=e) - raise e - # insert final batch - if many_values: - cursor.executemany(sql, many_values) - rows_affected += cursor.rowcount - if logger: - # NOTE: REPLACE INTO marks 2 rows affected for a "replace" (one for a delete and one for a re-insert) - # which allows us to count rows which were updated - logger.info('rows affected', total=rows_affected, updated=rows_affected-num_values) + self._process_rows(publication_date + , dataframe + , logger + , dataframe_columns_and_types + , nan_safe_dtype + , sql + , id_and_publication_date + , num_values) # deal with non/seldomly updated columns used like a fk table (if this database needs it) if hasattr(self, 'AGGREGATE_KEY_COLS'): @@ -261,6 +238,40 @@ def nan_safe_dtype(dtype, value): with self.new_cursor() as cur: cur.executemany(ak_insert_sql, ak_data) + def _process_rows(self, publication_date, dataframe, logger, dataframe_columns_and_types, nan_safe_dtype, sql + , id_and_publication_date, num_values): + n = 0 + rows_affected = 0 + many_values = [] + with self.new_cursor() as cursor: + for index, row in dataframe.iterrows(): + values = [] + for c in dataframe_columns_and_types: + values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) + many_values.append(id_and_publication_date + + tuple(values) + + tuple(i.csv_name for i in self.additional_fields)) + n += 1 + # insert in batches because one at a time is slow and all at once makes + # the connection drop :( + if n % 5_000 == 0: + try: + cursor.executemany(sql, many_values) + rows_affected += cursor.rowcount + many_values = [] + except Exception as e: + if logger: + logger.error('error on insert', publ_date=publication_date, in_lines=(n-5_000, n), index=index, values=values, exception=e) + raise e + # insert final batch + if many_values: + cursor.executemany(sql, many_values) + rows_affected += cursor.rowcount + if logger: + # NOTE: REPLACE INTO marks 2 rows affected for a "replace" (one for a delete and one for a re-insert) + # which allows us to count rows which were updated + logger.info('rows affected', total=rows_affected, updated=rows_affected-num_values) + def get_max_issue(self, logger=False): """Fetch the most recent issue. 
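A note on the row accounting at the end of the extracted helper: MySQL's REPLACE INTO reports two affected rows for a replace (a delete plus a re-insert) and one for a plain insert, so the number of updated rows falls out by subtraction. A small worked check under that assumption:

    # If 1,000 rows were sent and MySQL reports 1,250 affected rows, then 250 of them
    # replaced existing rows, because each replace adds one extra count.
    num_values = 1_000
    rows_affected = 1_250
    updated = rows_affected - num_values
    assert updated == 250
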
From b25edb7a295e516c46fa0f44db67dd56c20d7730 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sat, 14 Dec 2024 21:38:34 +0200 Subject: [PATCH 08/37] src\server\endpoints\covidcast.py too-many-statements Function handle_meta had 65 statements while Pylint recommends having at most 50. I extracted _process_signals to make the code more structured and solve that. --- src/server/endpoints/covidcast.py | 46 +++++++++++++++++-------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/src/server/endpoints/covidcast.py b/src/server/endpoints/covidcast.py index 3d7d99e82..9551cdf8f 100644 --- a/src/server/endpoints/covidcast.py +++ b/src/server/endpoints/covidcast.py @@ -447,27 +447,7 @@ def handle_meta(): meta_signals: List[Dict[str, Any]] = [] - for signal in source.signals: - if filter_active is not None and signal.active != filter_active: - continue - if filter_signal and all((not s.matches(signal.source, signal.signal) for s in filter_signal)): - continue - if filter_smoothed is not None and signal.is_smoothed != filter_smoothed: - continue - if filter_weighted is not None and signal.is_weighted != filter_weighted: - continue - if filter_cumulative is not None and signal.is_cumulative != filter_cumulative: - continue - if filter_time_type is not None and signal.time_type != filter_time_type: - continue - meta_data = by_signal.get((source.db_source, signal.signal)) - if not meta_data: - continue - row = meta_data[0] - entry = CovidcastMetaEntry(signal, row["min_time"], row["max_time"], row["max_issue"]) - for row in meta_data: - entry.intergrate(row) - meta_signals.append(entry.asdict()) + _process_signals(filter_signal, filter_smoothed, filter_weighted, filter_cumulative, filter_active, filter_time_type, by_signal, source, meta_signals) if not meta_signals: # none found or no signals continue @@ -478,6 +458,30 @@ def handle_meta(): return jsonify(sources) +def _process_signals(filter_signal, filter_smoothed, filter_weighted, filter_cumulative, filter_active + , filter_time_type, by_signal, source, meta_signals): + for signal in source.signals: + if filter_active is not None and signal.active != filter_active: + continue + if filter_signal and all((not s.matches(signal.source, signal.signal) for s in filter_signal)): + continue + if filter_smoothed is not None and signal.is_smoothed != filter_smoothed: + continue + if filter_weighted is not None and signal.is_weighted != filter_weighted: + continue + if filter_cumulative is not None and signal.is_cumulative != filter_cumulative: + continue + if filter_time_type is not None and signal.time_type != filter_time_type: + continue + meta_data = by_signal.get((source.db_source, signal.signal)) + if not meta_data: + continue + row = meta_data[0] + entry = CovidcastMetaEntry(signal, row["min_time"], row["max_time"], row["max_issue"]) + for row in meta_data: + entry.intergrate(row) + meta_signals.append(entry.asdict()) + @bp.route("/coverage", methods=("GET", "POST")) def handle_coverage(): From 1310e154d92497682a686fcb1c64219db4cf0d35 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Sun, 15 Dec 2024 17:59:22 +0200 Subject: [PATCH 09/37] src\acquisition\wiki\wiki_download.py too-many-branches Function run had 24 branches while Pylint recommends having at most 12. I extracted the debug_log function (print in debug mode) which is reused a few times. I also extracted methods to make the code more structured and solve that. 
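Funneling the repeated `if debug_mode: print(...)` pairs through the debug_log helper is what removes several of the counted branches: each call site becomes a plain call. The standard logging module is a common alternative for the same effect; it is not what this patch does, but a minimal sketch for comparison:

    # Alternative sketch using stdlib logging instead of threading a debug_mode flag around.
    import logging

    logging.basicConfig(level=logging.DEBUG)   # switch to logging.INFO to silence debug output
    logger = logging.getLogger("wiki_download")

    logger.debug("*** running in debug mode ***")
    logger.info("this is worker@host")
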
--- src/acquisition/wiki/wiki_download.py | 63 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 6192eab02..2032b38a4 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -72,6 +72,10 @@ def get_hmac_sha256(key, msg): return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() +def debug_log(message, debug_mode=False): + if debug_mode: + print(message) + def extract_article_counts(filename, language, articles, debug_mode): """ Support multiple languages ('en' | 'es' | 'pt') @@ -150,13 +154,11 @@ def extract_article_counts_orig(articles, debug_mode): counts["total"] = total return counts - def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() print(f"this is [{worker}]") - if debug_mode: - print("*** running in debug mode ***") + debug_log("*** running in debug mode ***", debug_mode) total_download = 0 passed_jobs = 0 @@ -180,10 +182,7 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d else: raise Exception(f"server response code (get) was {int(code)}") # Make the code compatible with mac os system - if platform == "darwin": - job_content = text(req.readlines()[1]) - else: - job_content = text(req.readlines()[0]) + job_content = _get_job_content(req) if job_content == "no jobs": print("no jobs available") if download_limit is None and job_limit is None: @@ -208,17 +207,10 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d # Make the code cross-platfrom, so use python to get the size of the file # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) size = os.stat("raw.gz").st_size - if debug_mode: - print(size) + debug_log(size, debug_mode) total_download += size - if job["hash"] != "00000000000000000000000000000000": - print("checking hash...") - out = text(subprocess.check_output("md5sum raw.gz", shell=True)) - result = out[0:32] - if result != job["hash"]: - raise Exception(f"wrong hash [expected {job['hash']}, got {result}]") - if debug_mode: - print(result) + _validate_hash(debug_mode, job) + print("decompressing...") subprocess.check_call("gunzip -f raw.gz", shell=True) # print 'converting case...' 
@@ -237,14 +229,11 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d } articles = lang2articles[language] articles = sorted(articles) - if debug_mode: - print(f"Language is {language} and target articles are {articles}") + debug_log(f"Language is {language} and target articles are {articles}", debug_mode) temp_counts = extract_article_counts("raw2", language, articles, debug_mode) counts[language] = temp_counts - if not debug_mode: - print("deleting files...") - subprocess.check_call("rm raw2", shell=True) + _remove_raw_files(debug_mode) print("saving results...") time_stop = datetime.datetime.now() result = { @@ -256,8 +245,10 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d } payload = json.dumps(result) hmac_str = get_hmac_sha256(secret, payload) - if debug_mode: - print(f" hmac: {hmac_str}") + + debug_log(f" hmac: {hmac_str}" + , debug_mode) + post_data = urlencode({"put": payload, "hmac": hmac_str}) req = urlopen(MASTER_URL, data=data(post_data)) code = req.getcode() @@ -275,6 +266,30 @@ def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, d ) time.sleep(sleep_time) + _alert_on_limit_reach(download_limit, job_limit, total_download, passed_jobs, failed_jobs) + +def _get_job_content(req): + if platform == "darwin": + job_content = text(req.readlines()[1]) + else: + job_content = text(req.readlines()[0]) + return job_content + +def _remove_raw_files(debug_mode): + if not debug_mode: + print("deleting files...") + subprocess.check_call("rm raw2", shell=True) + +def _validate_hash(debug_mode, job): + if job["hash"] != "00000000000000000000000000000000": + print("checking hash...") + out = text(subprocess.check_output("md5sum raw.gz", shell=True)) + result = out[0:32] + if result != job["hash"]: + raise Exception(f"wrong hash [expected {job['hash']}, got {result}]") + debug_log(result, debug_mode) + +def _alert_on_limit_reach(download_limit, job_limit, total_download, passed_jobs, failed_jobs): if download_limit is not None and total_download >= download_limit: print(f"download limit has been reached [{int(total_download)} >= {int(download_limit)}]") if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: From fa85df26382e126fa3e8adb2b29aaf571137eae1 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 09:54:50 +0200 Subject: [PATCH 10/37] src\acquisition\ght\ght_update.py too-many-statements Function update had 56 statements while Pylint recommends having at most 50. I extracted _process_values to make the code more structured and solve that. --- src/acquisition/ght/ght_update.py | 41 +++++++++++++++++-------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index 9e8d48d1d..97e4f3873 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -310,25 +310,7 @@ def get_num_rows(): values = [p["value"] for p in result["data"]["lines"][0]["points"]] ew = result["start_week"] num_missing = 0 - for v in values: - # Default SQL location value for US country for backwards compatibility - # i.e. 
California's location is still stored as 'CA', - # and having location == 'US' is still stored as 'US' - sql_location = location if location != NO_LOCATION_STR else country - - # Change SQL location for non-US countries - if country != "US": - # Underscore added to distinguish countries from 2-letter US states - sql_location = country + "_" - if location != NO_LOCATION_STR: - sql_location = sql_location + location - sql_data = (term, sql_location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - # print(' [%s|%s|%d] missing value' % (term, location, ew)) - ew = flu.add_epiweeks(ew, 1) + _process_values(cur, sql, total_rows, term, location, country, values, ew, num_missing) if num_missing > 0: print(f" [{term}|{location}] missing {int(num_missing)}/{len(values)} value(s)") except Exception as ex: @@ -343,6 +325,27 @@ def get_num_rows(): cnx.commit() cnx.close() +def _process_values(cur, sql, total_rows, term, location, country, values, ew, num_missing): + for v in values: + # Default SQL location value for US country for backwards compatibility + # i.e. California's location is still stored as 'CA', + # and having location == 'US' is still stored as 'US' + sql_location = location if location != NO_LOCATION_STR else country + + # Change SQL location for non-US countries + if country != "US": + # Underscore added to distinguish countries from 2-letter US states + sql_location = country + "_" + if location != NO_LOCATION_STR: + sql_location = sql_location + location + sql_data = (term, sql_location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + # print(' [%s|%s|%d] missing value' % (term, location, ew)) + ew = flu.add_epiweeks(ew, 1) + def main(): # args and usage From 06b088a3aebc7ac016d6a5eaa946626ab59b58cc Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 10:06:24 +0200 Subject: [PATCH 11/37] src\acquisition\nidss\taiwan_update.py wildcard-import Wildcard imports (import *) make it harder to understand what is imported from where. Removing it is also a defensive programming act, lowering the probability of collisions due to future new imports or objects. Instead, I imported explicitly delta_epiweeks, add_epiweeks from delphi.utils.epiweek (line 85) --- src/acquisition/nidss/taiwan_update.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index 30d458481..762cb427f 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -82,8 +82,7 @@ # first party from .taiwan_nidss import NIDSS import delphi.operations.secrets as secrets -from delphi.utils.epiweek import * - +from delphi.utils.epiweek import delta_epiweeks, add_epiweeks # Get a row count just to know how many new rows are inserted def get_rows(cnx): From a3251c5c4bc7db16967efdc2e140357c970799e7 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 11:52:12 +0200 Subject: [PATCH 12/37] src\client\delphi_epidata.py broad-exception-caught MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catching Exception might hide unexpected exceptions (e.g., due to new code that will be added). 
Static method _request of class Epidata  catches exception (line 134) The try section is basically request.get Exception was changed to requests.exceptions.JSONDecodeError                    , requests.exceptions.HTTPError   , requests.exceptions.Timeout                    , requests.exceptions.TooManyRedirects  , and requests.exceptions.RequestException For details see https://requests.readthedocs.io/en/latest/user/quickstart/#make-a-request Same happened in static method _version_check (line 63). Added there also AttributeError due to looking for ['info']['version']  in line 62. --- src/client/delphi_epidata.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/client/delphi_epidata.py b/src/client/delphi_epidata.py index 998c85281..8f7a3233c 100644 --- a/src/client/delphi_epidata.py +++ b/src/client/delphi_epidata.py @@ -60,7 +60,12 @@ def _version_check(): try: request = requests.get('https://pypi.org/pypi/delphi-epidata/json', timeout=5) latest_version = request.json()['info']['version'] - except Exception as e: + except (requests.exceptions.JSONDecodeError + , requests.exceptions.HTTPError + , requests.exceptions.Timeout + , requests.exceptions.TooManyRedirects + , requests.exceptions.RequestException + , AttributeError) as e: Epidata.log("Error getting latest client version", exception=str(e)) return @@ -126,7 +131,11 @@ def _request(endpoint, params={}): """ try: result = Epidata._request_with_retry(endpoint, params) - except Exception as e: + except (requests.exceptions.JSONDecodeError + , requests.exceptions.HTTPError + , requests.exceptions.Timeout + , requests.exceptions.TooManyRedirects + , requests.exceptions.RequestException) as e: return {"result": 0, "message": "error: " + str(e)} if params is not None and "format" in params and params["format"] == "csv": return result.text From 6f627b92cae31f08957d9ca9201b73cb68b8c38d Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 13:38:53 +0200 Subject: [PATCH 13/37] src\acquisition\kcdc\kcdc_update.py broad-exception-caught Catching Exception might hide unexpected exceptions (e.g., due to new code that will be added). 
Function update_from_data catches exception (line 152) The try section is cursor's execute Exception was changed to mysql.connector.Error For details see https://dev.mysql.com/doc/connector-python/en/connector-python-api-errors-error.html --- src/acquisition/kcdc/kcdc_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index 713b21f00..b1101c44f 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -149,7 +149,7 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): update_args = [date, ili] try: insert.execute(sql % tuple(insert_args + update_args)) - except Exception: + except mysql.connector.Error: pass # cleanup From 8d1c155334194a53bbc74355ac39d37ce5b345cd Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 13:39:39 +0200 Subject: [PATCH 14/37] src\server\endpoints\sensors.py line-too-long Made query more readable --- src/server/endpoints/sensors.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/server/endpoints/sensors.py b/src/server/endpoints/sensors.py index 1c76f47ac..8a6304dd0 100644 --- a/src/server/endpoints/sensors.py +++ b/src/server/endpoints/sensors.py @@ -55,7 +55,8 @@ def _authenticate(names: List[str]): if unauthenticated_or_nonexistent_sensors: raise EpiDataException( - f"unauthenticated/nonexistent sensor(s): {','.join(unauthenticated_or_nonexistent_sensors)}" + ( f"unauthenticated/nonexistent sensor(s): " + + f"{','.join(unauthenticated_or_nonexistent_sensors)}") ) @@ -83,7 +84,10 @@ def handle(): # build the epiweek filter condition_epiweek = filter_integers("s.`epiweek`", epiweeks, "epiweek", params) # the query - query = f"SELECT {fields} FROM {table} WHERE ({condition_name}) AND ({condition_location}) AND ({condition_epiweek}) ORDER BY {order}" + query = (f"SELECT {fields} " + + f"FROM {table} " + + f"WHERE ({condition_name}) AND ({condition_location}) AND ({condition_epiweek})" + + f" ORDER BY {order}") fields_string = ["name", "location"] fields_int = ["epiweek"] From e229426cb56ed68fdf023f485ecfaaa4ed5c46dc Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 13:50:04 +0200 Subject: [PATCH 15/37] src\acquisition\paho\paho_db_update.py line-too-long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made unreadable line shorter --- src/acquisition/paho/paho_db_update.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index b351d3ff2..e8ba99d64 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -118,11 +118,13 @@ def get_rows(cnx, table="paho_dengue"): def get_paho_row(row): - if row[ + if (row[ 0 - ] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split( - "," - ): + ] == "\ufeffIncidence Rate (c)" + and row != ("\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f)," + + "ID,Country or Subregion,Deaths,EW,Confirmed," + + "Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d)," + + "Total of Dengue Cases (b),Year,Population x 1000").split(",")): raise Exception("PAHO header row has changed") if len(row) == 1 or row[0] == "Incidence Rate (c)": # this is a header 
row From 3accb562bf305e0995387221a889f0951f0d45a4 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 13:54:05 +0200 Subject: [PATCH 16/37] src\server\_limiter.py line-too-long Made readable lines shorter --- src/server/_limiter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/server/_limiter.py b/src/server/_limiter.py index c54a2141c..56c97ff79 100644 --- a/src/server/_limiter.py +++ b/src/server/_limiter.py @@ -8,7 +8,8 @@ from ._config import RATE_LIMIT, RATELIMIT_STORAGE_URL, REDIS_HOST, REDIS_PASSWORD from ._exceptions import ValidationFailedException from ._params import extract_dates, extract_integers, extract_strings, parse_source_signal_sets -from ._security import _is_public_route, current_user, resolve_auth_token, ERROR_MSG_RATE_LIMIT, ERROR_MSG_MULTIPLES +from ._security import (_is_public_route, current_user, resolve_auth_token + , ERROR_MSG_RATE_LIMIT, ERROR_MSG_MULTIPLES) @@ -61,7 +62,8 @@ def get_multiples_count(request): if len(vals) >= 2: multiple_selection_allowed -= 1 elif len(vals) and isinstance(vals, list) and isinstance(vals[0], tuple): - # else we have one val which is a tuple, representing a range, and thus is a "multiple" + # else we have one val which is a tuple, + # representing a range, and thus is a "multiple" multiple_selection_allowed -= 1 except ValidationFailedException: continue From 241a5185643cf2634d3be4c91874362ddadb0411 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 13:59:35 +0200 Subject: [PATCH 17/37] src\server\covidcast_issues_migration\proc_db_backups_pd.py line-too-long Made readable lines shorter --- src/server/covidcast_issues_migration/proc_db_backups_pd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/server/covidcast_issues_migration/proc_db_backups_pd.py b/src/server/covidcast_issues_migration/proc_db_backups_pd.py index 1aa2cbe1b..fd06efec5 100755 --- a/src/server/covidcast_issues_migration/proc_db_backups_pd.py +++ b/src/server/covidcast_issues_migration/proc_db_backups_pd.py @@ -25,7 +25,8 @@ # Column names INDEX_COLS = ["source", "signal", "time_type", "geo_type", "time_value", "geo_value"] -VALUE_COLS = ["value_updated_timestamp", "value", "stderr", "sample_size", "direction_updated_timestamp", "direction"] +VALUE_COLS = ["value_updated_timestamp", "value", "stderr", "sample_size" + , "direction_updated_timestamp", "direction"] ALL_COLS = INDEX_COLS + VALUE_COLS ALL_COLS_WITH_PK = ["id"] + ALL_COLS @@ -469,7 +470,8 @@ def generate_issues( row_fmt = "(" \ "{id},{source},{signal},{time_type},{geo_type},{time_value},{geo_value}," \ - "{row.value_updated_timestamp},{row.value},{row.stderr},{row.sample_size},{row.direction_updated_timestamp},{row.direction}," \ + "{row.value_updated_timestamp},{row.value},{row.stderr},{row.sample_size}" \ + ",{row.direction_updated_timestamp},{row.direction}," \ "{issue},{row.lag})" try: From a07e7caac84e6d5030178044d2ebbabcb1723ffd Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 14:42:17 +0200 Subject: [PATCH 18/37] src\maintenance\remove_outdated_keys.py line-too-long Made readable lines shorter. Email body is more readable. 
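For long message literals like these, an alternative to joining pieces with `+` is a dedented triple-quoted string, which keeps source lines short while leaving the body readable; a sketch with illustrative wording and a hypothetical link:

    # Sketch: textwrap.dedent strips the common leading indentation from the literal.
    from textwrap import dedent

    registration_link = "https://example.org/register"   # hypothetical placeholder
    DELETED_EMAIL_MESSAGE = dedent(f"""\
        Hi!
        Your API Key was removed due to inactivity.
        To get a new one, please use the registration form ({registration_link}) or contact us.
        """)
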
--- src/maintenance/remove_outdated_keys.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/maintenance/remove_outdated_keys.py b/src/maintenance/remove_outdated_keys.py index 56fea0cf9..c5b58ce06 100644 --- a/src/maintenance/remove_outdated_keys.py +++ b/src/maintenance/remove_outdated_keys.py @@ -14,8 +14,9 @@ EMAIL_FROM = "noreply@andrew.cmu.edu" ALERT_EMAIL_MESSAGE = f"""Hi! \n Your API Key is going to be removed due to inactivity. To renew it, pelase use it within one month from now.""" -DELETED_EMAIL_MESSAGE = f"""Hi! \n Your API Key was removed due to inactivity. -To get new one, please use registration form ({API_KEY_REGISTRATION_FORM_LINK_LOCAL}) or contact us.""" +DELETED_EMAIL_MESSAGE = (f"""Hi! \n Your API Key was removed due to inactivity. +To get new one, please use registration form ({API_KEY_REGISTRATION_FORM_LINK_LOCAL})""" ++ """ or contact us.""") def get_old_keys(cur): @@ -43,7 +44,10 @@ def remove_outdated_key(cur, api_key): def send_notification(to_addr, alert=True): message = ALERT_EMAIL_MESSAGE if alert else DELETED_EMAIL_MESSAGE - BODY = "\r\n".join((f"FROM: {EMAIL_FROM}", f"TO: {to_addr}", f"Subject: {EMAIL_SUBJECT}", "", message)) + BODY = "\r\n".join((f"FROM: {EMAIL_FROM}" + , f"TO: {to_addr}" + , f"Subject: {EMAIL_SUBJECT}" + , "", message)) smtp_server = SMTP(host=SMTP_HOST, port=SMTP_PORT) smtp_server.starttls() smtp_server.sendmail(EMAIL_FROM, to_addr, BODY) From 42612bf68e6f24976784666027740ae08a0ac08b Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 14:45:37 +0200 Subject: [PATCH 19/37] src\acquisition\wiki\wiki_util.py line-too-long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made unreadable line shorter --- src/acquisition/wiki/wiki_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/acquisition/wiki/wiki_util.py b/src/acquisition/wiki/wiki_util.py index 55bf3e2ca..d0056e83c 100644 --- a/src/acquisition/wiki/wiki_util.py +++ b/src/acquisition/wiki/wiki_util.py @@ -1,6 +1,7 @@ class Articles: - # Notice that all languages must be two chars, because that `language` column in table `wiki` is CHAR(2) + # Notice that all languages must be two chars, + # because that `language` column in table `wiki` is CHAR(2) available_languages = ["en", "es", "pt"] en_articles_flu = [ From 18cef976b86d39acba2a57e701001d840be1a146 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:13:21 +0200 Subject: [PATCH 20/37] src\acquisition\fluview\impute_missing_values.py line-too-long Made readable line shorter --- src/acquisition/fluview/impute_missing_values.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index c795d9cce..88699712f 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -135,7 +135,10 @@ class Sql: def connect(self): """Connect to the database.""" u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database="epidata", host=secrets.db.host) + self.cnx = mysql.connector.connect(user=u + , password=p + , database="epidata" + , host=secrets.db.host) self.cur = self.cnx.cursor() def close(self, commit): From a3099a502146b07d8660287db8c0cae8c70ec567 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:40:05 +0200 Subject: [PATCH 21/37] scripts\report_missing_covidcast_meta.py line-too-long MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made unreadable line shorter --- scripts/report_missing_covidcast_meta.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/report_missing_covidcast_meta.py b/scripts/report_missing_covidcast_meta.py index 6346541b9..bcc3b0129 100644 --- a/scripts/report_missing_covidcast_meta.py +++ b/scripts/report_missing_covidcast_meta.py @@ -38,7 +38,12 @@ def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]: def gen_row(source: str, signal: str, info: Dict) -> Dict: - is_weighted = signal.startswith('smoothed_w') and not (signal.startswith('smoothed_wa') or signal.startswith('smoothed_we') or signal.startswith('smoothed_wi') or signal.startswith('smoothed_wo') or signal.startswith('smoothed_wu')) + is_weighted = (signal.startswith('smoothed_w') + and not (signal.startswith('smoothed_wa') + or signal.startswith('smoothed_we') + or signal.startswith('smoothed_wi') + or signal.startswith('smoothed_wo') + or signal.startswith('smoothed_wu'))) base_name = signal.replace('smoothed_w', 'smoothed_') if is_weighted else signal bool_str = lambda x: 'TRUE' if x else 'FALSE' From c28759d3ea7e2009b48210707f309e86331759bb Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:40:46 +0200 Subject: [PATCH 22/37] src\server\endpoints\fluview_meta.py line-too-long Made query more readable --- src/server/endpoints/fluview_meta.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/server/endpoints/fluview_meta.py b/src/server/endpoints/fluview_meta.py index c5ef8c894..87f5d17ac 100644 --- a/src/server/endpoints/fluview_meta.py +++ b/src/server/endpoints/fluview_meta.py @@ -9,7 +9,10 @@ def meta_fluview(): - query = "SELECT max(`release_date`) `latest_update`, max(`issue`) `latest_issue`, count(1) `table_rows` FROM `fluview`" + query = ("SELECT max(`release_date`) `latest_update`" + + ", max(`issue`) `latest_issue`" + + ", count(1) `table_rows` " + + " FROM `fluview`") fields_string = ["latest_update"] fields_int = ["latest_issue", "table_rows"] return parse_result(query, {}, fields_string, fields_int, None) From dd809dcca4b424edbb7e1ced230f1a18c409f1c2 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:46:53 +0200 Subject: [PATCH 23/37] src\server\endpoints\covidcast_meta.py line-too-long Made query more readable --- src/server/endpoints/covidcast_meta.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index 8c2219ae7..3cbe55d34 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -42,7 +42,10 @@ def handle(): metadata = db.execute( text( - "SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1" + "SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age" + + ", epidata " + + "FROM covidcast_meta_cache " + + " LIMIT 1" ) ).fetchone() @@ -103,6 +106,7 @@ def cache_entry_gen(): headers={ "Cache-Control": f"max-age={standard_age}, public", "Age": f"{reported_age}", - # TODO?: "Expires": f"{}", # superseded by Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires + # TODO?: "Expires": f"{}", # superseded by Cache-Control: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires } ) From 15402fda69b5b5d97c676e9c2e30a3a6df0ff761 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:50:29 +0200 Subject: [PATCH 24/37] 
src\server\utils\__init__.py line-too-long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made unreadable line shorter --- src/server/utils/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/server/utils/__init__.py b/src/server/utils/__init__.py index 2e99dfeba..648e17b48 100644 --- a/src/server/utils/__init__.py +++ b/src/server/utils/__init__.py @@ -1 +1,5 @@ -from .dates import shift_day_value, day_to_time_value, time_value_to_iso, time_value_to_day, days_in_range, weeks_in_range, shift_week_value, week_to_time_value, time_value_to_week, guess_time_value_is_day, guess_time_value_is_week, time_values_to_ranges, days_to_ranges, weeks_to_ranges, IntRange, TimeValues +from .dates import (shift_day_value, day_to_time_value, time_value_to_iso + , time_value_to_day, days_in_range, weeks_in_range, shift_week_value + , week_to_time_value, time_value_to_week, guess_time_value_is_day + , guess_time_value_is_week, time_values_to_ranges, days_to_ranges + , weeks_to_ranges, IntRange, TimeValues) From 937012fda07e95c9834b0fe6547d5e7871332155 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:55:46 +0200 Subject: [PATCH 25/37] src\maintenance\update_last_usage.py line-too-long Used a single assignment per variable, made query more readable --- src/maintenance/update_last_usage.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/maintenance/update_last_usage.py b/src/maintenance/update_last_usage.py index 85596778a..bdc4c9538 100644 --- a/src/maintenance/update_last_usage.py +++ b/src/maintenance/update_last_usage.py @@ -19,13 +19,16 @@ def main(): redis_keys = redis_cli.keys(pattern=LAST_USED_KEY_PATTERN) today_date = dtime.today().date() for key in redis_keys: - api_key, last_time_used = str(key).split("/")[1], dtime.strptime(str(redis_cli.get(key)), "%Y-%m-%d").date() + api_key = str(key).split("/")[1] + last_time_used = dtime.strptime(str(redis_cli.get(key)), "%Y-%m-%d").date() cur.execute( f""" UPDATE api_user SET last_time_used = "{last_time_used}" - WHERE api_key = "{api_key}" AND (last_time_used < "{last_time_used}" OR last_time_used IS NULL) + WHERE api_key = "{api_key}" + AND (last_time_used < "{last_time_used}" + OR last_time_used IS NULL) """ ) redis_cli.delete(key) From cddf0afd35e65dba90dda055da71d27b128c637c Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 15:58:51 +0200 Subject: [PATCH 26/37] src\acquisition\ght\google_health_trends.py line-too-long Made readable lines shorter --- src/acquisition/ght/google_health_trends.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 4bb8df25f..f1c69c53c 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -72,8 +72,10 @@ def get_data(self, start_week, end_week, location, term, resolution="week", coun "timelineResolution": resolution, } # We have a special check for the US for backwards compatibility. - # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. - # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. + # i.e. if the country is 'US' AND the location is 'US' + # , just put the geo-restriction for country. 
+ # In contrast, another country might have a sub-region with initials 'US' + # and we want the region restriction instead. if country == "US": if location == "US" or location == NO_LOCATION_STR: params["geoRestriction_country"] = "US" From f14852c152cf5212fe44923b8031b3eeecd85a0d Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 16:01:43 +0200 Subject: [PATCH 27/37] src\server\endpoints\covid_hosp_facility_lookup.py line-too-long Made readable lines shorter --- src/server/endpoints/covid_hosp_facility_lookup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server/endpoints/covid_hosp_facility_lookup.py b/src/server/endpoints/covid_hosp_facility_lookup.py index 751dfebb3..233e760fb 100644 --- a/src/server/endpoints/covid_hosp_facility_lookup.py +++ b/src/server/endpoints/covid_hosp_facility_lookup.py @@ -20,7 +20,8 @@ def handle(): # build query q = QueryBuilder("covid_hosp_facility_key", "c") q.fields = ", ".join( - [ # NOTE: fields `geocoded_hospital_address` and `hhs_ids` are available but not being provided by this endpoint. + [ # NOTE: fields `geocoded_hospital_address` and `hhs_ids` + # are available but not being provided by this endpoint. f"{q.alias}.hospital_pk", f"{q.alias}.state", f"{q.alias}.ccn", From a921f7b685e59fecfab37c17cbbd7fcbd68104ae Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 16:04:59 +0200 Subject: [PATCH 28/37] src\server\_pandas.py line-too-long Made readable lines shorter --- src/server/_pandas.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/server/_pandas.py b/src/server/_pandas.py index 68cbc8833..252e1e68e 100644 --- a/src/server/_pandas.py +++ b/src/server/_pandas.py @@ -12,10 +12,17 @@ from ._exceptions import DatabaseErrorException -def as_pandas(query: str, params: Dict[str, Any], db_engine: Engine = engine, parse_dates: Optional[Dict[str, str]] = None, limit_rows = MAX_RESULTS+1) -> pd.DataFrame: +def as_pandas(query: str + , params: Dict[str, Any] + , db_engine: Engine = engine + , parse_dates: Optional[Dict[str, str]] = None + , limit_rows = MAX_RESULTS+1) -> pd.DataFrame: try: query = limit_query(query, limit_rows) - return pd.read_sql_query(text(str(query)), db_engine, params=params, parse_dates=parse_dates) + return pd.read_sql_query(text(str(query)) + , db_engine + , params=params + , parse_dates=parse_dates) except Exception as e: raise DatabaseErrorException(str(e)) From 99d03a56f27af8cba4683eaed5f83f7f71293cc0 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 16:07:16 +0200 Subject: [PATCH 29/37] src\server\endpoints\covidcast_utils\meta.py line-too-long --- src/server/endpoints/covidcast_utils/meta.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/server/endpoints/covidcast_utils/meta.py b/src/server/endpoints/covidcast_utils/meta.py index 9188351be..e993eb75b 100644 --- a/src/server/endpoints/covidcast_utils/meta.py +++ b/src/server/endpoints/covidcast_utils/meta.py @@ -26,7 +26,10 @@ def intergrate(self, row: Dict[str, Any]): self.max_time = row["max_time"] if row["max_issue"] > self.max_issue: self.max_issue = row["max_issue"] - self.geo_types[row["geo_type"]] = CovidcastMetaStats(row["min_value"], row["mean_value"], row["stdev_value"], row["max_value"]) + self.geo_types[row["geo_type"]] = CovidcastMetaStats(row["min_value"] + , row["mean_value"] + , row["stdev_value"] + , row["max_value"]) def asdict(self): r = asdict(self) From 106d5c00ad2b8541ee6a5fd004c366cedd24b010 Mon Sep 17 00:00:00 2001 From: 
evidencebp Date: Mon, 16 Dec 2024 16:10:19 +0200 Subject: [PATCH 30/37] src\server\main.py line-too-long Made readable line shorter --- src/server/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server/main.py b/src/server/main.py index 9d308c8ac..f4f4db045 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -22,7 +22,8 @@ environment = os.environ.get('SENTRY_ENVIRONMENT', 'development'), profiles_sample_rate = float(os.environ.get('SENTRY_PROFILES_SAMPLE_RATE', 1.0)), traces_sample_rate = float(os.environ.get('SENTRY_TRACES_SAMPLE_RATE', 1.0)), - attach_stacktrace = os.environ.get('SENTRY_ATTACH_STACKTRACE', 'False').lower() in ('true', '1', 't'), + attach_stacktrace = (os.environ.get('SENTRY_ATTACH_STACKTRACE', 'False').lower() + in ('true', '1', 't')), debug = os.environ.get('SENTRY_DEBUG', 'False').lower() in ('true', '1', 't') ) From 282188e41644368ea084c9f4cfcc30b6b506b6a1 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 16:14:25 +0200 Subject: [PATCH 31/37] src\server\_exceptions.py line-too-long --- src/server/_exceptions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/server/_exceptions.py b/src/server/_exceptions.py index 86ef028c1..55d37ad80 100644 --- a/src/server/_exceptions.py +++ b/src/server/_exceptions.py @@ -22,7 +22,9 @@ def __init__(self, message: str, status_code: int = 500): class MissingOrWrongSourceException(EpiDataException): def __init__(self, endpoints: Iterable[str]): - super(MissingOrWrongSourceException, self).__init__(f"no data source specified, possible values: {','.join(endpoints)}", 400) + super(MissingOrWrongSourceException, self).__init__( + f"no data source specified, possible values: {','.join(endpoints)}" + , 400) class ValidationFailedException(EpiDataException): From 33b5e0b9504f80f881b0766c9cd24c067f7024d4 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 16 Dec 2024 16:19:44 +0200 Subject: [PATCH 32/37] src\acquisition\twtr\healthtweets.py line-too-long Made readable lines shorter --- src/acquisition/twtr/healthtweets.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index c1e345162..c8b1cee83 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -95,11 +95,9 @@ def __init__(self, username, password, debug=False): self.debug = debug self.session = requests.Session() # spoof a web browser - self.session.headers.update( - { - "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", - } - ) + self.session.headers.update({"User-Agent": + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", + }) # get the login token response = self._go("https://www.healthtweets.org/accounts/login") token = self._get_token(response.text) @@ -123,7 +121,8 @@ def get_values(self, state, date1, date2): date2: the last date in the range, inclusive (format: YYYY-MM-DD) returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) """ - # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) + # get raw values (number of flu tweets) and + # normalized values (flu tweets as a percent of total tweets) raw_values = self._get_values(state, date1, date2, False) normalized_values = self._get_values(state, date1, date2, True) values = {} From 
From 59a0ba100bf58c44f1572c2f14f90a073f3bae99 Mon Sep 17 00:00:00 2001
From: evidencebp
Date: Mon, 16 Dec 2024 16:23:20 +0200
Subject: [PATCH 33/37] src\maintenance\signal_dash_data_generator.py line-too-long

Made the coverage URL construction more readable

---
 src/maintenance/signal_dash_data_generator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/maintenance/signal_dash_data_generator.py b/src/maintenance/signal_dash_data_generator.py
index 5a7067f83..fab6d9ce2 100644
--- a/src/maintenance/signal_dash_data_generator.py
+++ b/src/maintenance/signal_dash_data_generator.py
@@ -20,7 +20,10 @@
 LOOKBACK_DAYS_FOR_COVERAGE = 56
 
 BASE_COVIDCAST = covidcast.covidcast.Epidata.BASE_URL + "/covidcast"
-COVERAGE_URL = f"{BASE_COVIDCAST}/coverage?format=csv&signal={{source}}:{{signal}}&days={LOOKBACK_DAYS_FOR_COVERAGE}"
+COVERAGE_URL = (f"{BASE_COVIDCAST}/coverage?"
+    + "format=csv"
+    + "&signal={source}:{signal}"
+    + f"&days={LOOKBACK_DAYS_FOR_COVERAGE}")
 
 @dataclass
 class DashboardSignal:

From 35488ff016d53fcd4049b2f5b8c3c4c3ea3c34e4 Mon Sep 17 00:00:00 2001
From: evidencebp
Date: Mon, 16 Dec 2024 16:28:40 +0200
Subject: [PATCH 34/37] src\server\endpoints\covid_hosp_facility.py line-too-long

Structured the long, hard-to-read query strings

---
 src/server/endpoints/covid_hosp_facility.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/server/endpoints/covid_hosp_facility.py b/src/server/endpoints/covid_hosp_facility.py
index b8e40d036..d05b28053 100644
--- a/src/server/endpoints/covid_hosp_facility.py
+++ b/src/server/endpoints/covid_hosp_facility.py
@@ -151,8 +151,14 @@ def handle():
         q.where_integers("publication_date", publication_dates)
     else:
         # final query using most recent issues
-        condition = f"x.max_publication_date = {q.alias}.publication_date AND x.collection_week = {q.alias}.collection_week AND x.hospital_pk = {q.alias}.hospital_pk"
-        q.subquery = f"JOIN (SELECT max(publication_date) max_publication_date, collection_week, hospital_pk FROM {q.table} WHERE {q.conditions_clause} GROUP BY collection_week, hospital_pk) x ON {condition}"
+        condition = (f"x.max_publication_date = {q.alias}.publication_date "
+            + f"AND x.collection_week = {q.alias}.collection_week "
+            + f"AND x.hospital_pk = {q.alias}.hospital_pk")
+        q.subquery = (f"JOIN (SELECT max(publication_date) max_publication_date"
+            + f", collection_week, hospital_pk "
+            + f" FROM {q.table} "
+            + f"WHERE {q.conditions_clause} "
+            + f"GROUP BY collection_week, hospital_pk) x ON {condition}")
         q.condition = []  # since used for join
 
     # send query

From d24a6418b42d75216d6fc432281e44c72ce8b6e6 Mon Sep 17 00:00:00 2001
From: evidencebp
Date: Mon, 16 Dec 2024 16:32:02 +0200
Subject: [PATCH 35/37] src\server\endpoints\delphi.py line-too-long

Made the query string more structured and readable

---
 src/server/endpoints/delphi.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/server/endpoints/delphi.py b/src/server/endpoints/delphi.py
index cb7efcd46..8d2e9c9d2 100644
--- a/src/server/endpoints/delphi.py
+++ b/src/server/endpoints/delphi.py
@@ -17,7 +17,10 @@ def handle():
     epiweek = int(request.values["epiweek"])
 
     # build query
-    query = "SELECT `system`, `epiweek`, `json` FROM `forecasts` WHERE `system` = :system AND `epiweek` = :epiweek LIMIT 1"
+    query = ("SELECT `system`, `epiweek`, `json` "
+        +"FROM `forecasts` "
+        +"WHERE `system` = :system AND `epiweek` = :epiweek "
+        +"LIMIT 1")
     params = dict(system=system, epiweek=epiweek)
 
     fields_string = ["system", "json"]
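When a long string such as COVERAGE_URL in patch 33 is split into several literals, each piece that interpolates a variable still needs its own f prefix, and doubled braces are only required inside f-strings. A minimal, self-contained sketch of the pattern (the URL and values below are made up for illustration):

    # Adjacent string literals are joined at compile time; only the pieces
    # prefixed with `f` are interpolated, un-prefixed pieces keep their braces.
    base = "https://example.invalid/covidcast"  # made-up base URL
    days = 56
    url = (f"{base}/coverage?"
           "format=csv"
           "&signal={source}:{signal}"  # left as a literal placeholder
           f"&days={days}")
    assert url == ("https://example.invalid/covidcast/coverage?"
                   "format=csv&signal={source}:{signal}&days=56")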
From a0f9a3b8a50801a4ca2993579367e70c56e92427 Mon Sep 17 00:00:00 2001
From: evidencebp
Date: Mon, 16 Dec 2024 16:35:25 +0200
Subject: [PATCH 36/37] src\common\covidcast_row.py line-too-long

Made an overly long line shorter and more readable

---
 src/common/covidcast_row.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/covidcast_row.py b/src/common/covidcast_row.py
index fc81d3a6c..cab83b5d3 100644
--- a/src/common/covidcast_row.py
+++ b/src/common/covidcast_row.py
@@ -109,7 +109,8 @@ def as_api_row_df(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame:
         return self.as_dataframe(ignore_fields=self._api_row_ignore_fields + (ignore_fields or []))
 
     # fmt: off
-    def as_api_compatibility_row_df(self, ignore_fields: Optional[List[str]] = None) -> pd.DataFrame:
+    def as_api_compatibility_row_df(self
+        , ignore_fields: Optional[List[str]] = None) -> pd.DataFrame:
         """Returns a dataframe view into the row with the fields returned by the old API server (the PHP server)."""
         # fmt: on

From 7f8b16c37de0b44ec2316955a1c9f802c6c89f33 Mon Sep 17 00:00:00 2001
From: evidencebp
Date: Mon, 16 Dec 2024 16:39:28 +0200
Subject: [PATCH 37/37] src\acquisition\covid_hosp\common\utils.py line-too-long

Made overly long lines shorter and more readable

---
 src/acquisition/covid_hosp/common/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/acquisition/covid_hosp/common/utils.py b/src/acquisition/covid_hosp/common/utils.py
index b72aec8f2..9e8e8c03c 100644
--- a/src/acquisition/covid_hosp/common/utils.py
+++ b/src/acquisition/covid_hosp/common/utils.py
@@ -126,7 +126,10 @@ def issues_to_fetch(metadata, newer_than, older_than, logger=False):
     if logger:
       if n_beyond > 0:
         logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond)
-      logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected)
+      logger.info("issues selected"
+                  , newer_than=str(newer_than)
+                  , older_than=str(older_than)
+                  , count=n_selected)
     return daily_issues
 
   @staticmethod
@@ -239,7 +242,8 @@ def update_dataset(database, network, newer_than=None, older_than=None):
         all_metadata
       ))
       tot_revs = sum(len(revisions) for revisions in daily_issues.values())
-      logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions, resulting in {len(datasets)} datasets.")
+      logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions"
+                  + f", resulting in {len(datasets)} datasets.")
       if not datasets:
         logger.info("nothing to do, exiting")
         return False
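Patch 37 splits a long f-string log message using an explicit `+`; adjacent f-string literals inside the parentheses work as well and avoid the operator. A short sketch with the standard logging module (the project's structured logger is not assumed here, and the counts are made up):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    n_issues, tot_revs, n_datasets = 3, 7, 2  # made-up counts for illustration

    logger.info(f"{n_issues} issues checked w/ {tot_revs} revisions"
                f", resulting in {n_datasets} datasets.")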