diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index 637eeb624..284b6049a 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -16,7 +16,7 @@ jobs:
     if: github.event.pull_request.draft == false
     strategy:
       matrix:
-        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, doctor_visits, dsew_community_profile, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot]
+        packages: [_delphi_utils_python, changehc, claims_hosp, doctor_visits, dsew_community_profile, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot]
     defaults:
       run:
         working-directory: ${{ matrix.packages }}
diff --git a/combo_cases_and_deaths/.pylintrc b/combo_cases_and_deaths/.pylintrc
deleted file mode 100644
index f30837c7e..000000000
--- a/combo_cases_and_deaths/.pylintrc
+++ /dev/null
@@ -1,22 +0,0 @@
-
-[MESSAGES CONTROL]
-
-disable=logging-format-interpolation,
-    too-many-locals,
-    too-many-arguments,
-    # Allow pytest functions to be part of a class.
-    no-self-use,
-    # Allow pytest classes to have one test.
-    too-few-public-methods
-
-[BASIC]
-
-# Allow arbitrarily short-named variables.
-variable-rgx=[a-z_][a-z0-9_]*
-argument-rgx=[a-z_][a-z0-9_]*
-attr-rgx=[a-z_][a-z0-9_]*
-
-[DESIGN]
-
-# Don't complain about pytest "unused" arguments.
-ignored-argument-names=(_.*|run_as_module)
\ No newline at end of file
diff --git a/combo_cases_and_deaths/Makefile b/combo_cases_and_deaths/Makefile
deleted file mode 100644
index bc88f1fec..000000000
--- a/combo_cases_and_deaths/Makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-.PHONY: venv lint test clean
-
-dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*' | head -1)
-venv:
-	python3.8 -m venv env
-
-install: venv
-	. env/bin/activate; \
-	pip install wheel ; \
-	pip install -e ../_delphi_utils_python ;\
-	pip install -e .
-
-install-ci: venv
-	. env/bin/activate; \
-	pip install wheel ; \
-	pip install ../_delphi_utils_python ;\
-	pip install .
-
-lint:
-	. env/bin/activate; pylint $(dir)
-	. env/bin/activate; pydocstyle $(dir)
-
-test:
-	. env/bin/activate ;\
-	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
-
-clean:
-	rm -rf env
-	rm -f params.json
diff --git a/combo_cases_and_deaths/README.md b/combo_cases_and_deaths/README.md
deleted file mode 100644
index ff0b4bab5..000000000
--- a/combo_cases_and_deaths/README.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# MODULE NAME
-
-
-## Running the Indicator
-
-The indicator is run by directly executing the Python module contained in this
-directory. The safest way to do this is to create a virtual environment,
-install the common DELPHI tools, and then install the module and its
-dependencies. To do this, run the following code from this directory:
-
-```
-make install
-```
-
-This command will install the package in editable mode, so you can make changes that
-will automatically propagate to the installed package.
-
-All of the user-changeable parameters are stored in `params.json`. To execute
-the module and produce the output datasets (by default, in `receiving`), run
-the following:
-
-```
-env/bin/python -m delphi_combo_cases_and_deaths
-```
-
-If you want to enter the virtual environment in your shell,
-you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment.
-
-Once you are finished, you can remove the virtual environment and
-params file with the following:
-
-```
-make clean
-```
-
-## Testing the code
-
-To run static tests of the code style, run the following command:
-
-```
-make lint
-```
-
-Unit tests are also included in the module. To execute these, run the following
-command from this directory:
-
-```
-make test
-```
-
-To run individual tests, run the following:
-
-```
-(cd tests && ../env/bin/pytest .py --cov=delphi_combo_cases_and_deaths --cov-report=term-missing)
-```
-
-The output will show the number of unit tests that passed and failed, along
-with the percentage of code covered by the tests.
-
-None of the linting or unit tests should fail, and the set of code lines not
-covered by unit tests should be small and should not include critical
-sub-routines.
diff --git a/combo_cases_and_deaths/REVIEW.md b/combo_cases_and_deaths/REVIEW.md
deleted file mode 100644
index 93a5a6579..000000000
--- a/combo_cases_and_deaths/REVIEW.md
+++ /dev/null
@@ -1,39 +0,0 @@
-## Code Review (Python)
-
-A code review of this module should include a careful look at the code and the
-output. To assist in the process, but certainly not in place of it, please
-check the following items.
-
-**Documentation**
-
-- [ ] the README.md file template is filled out and currently accurate; it is
-possible to load and test the code using only the instructions given
-- [ ] minimal docstrings (one line describing what the function does) are
-included for all functions; full docstrings describing the inputs and expected
-outputs should be given for non-trivial functions
-
-**Structure**
-
-- [ ] code should use 4 spaces for indentation; other style decisions are
-flexible, but be consistent within a module
-- [ ] any required metadata files are checked into the repository and placed
-within the directory `static`
-- [ ] any intermediate files that are created and stored by the module should
-be placed in the directory `cache`
-- [ ] final expected output files to be uploaded to the API are placed in the
-`receiving` directory; output files should not be committed to the repository
-- [ ] all options and API keys are passed through the file `params.json`
-- [ ] template parameter file (`params.json.template`) is checked into the
-code; no personal (i.e., usernames) or private (i.e., API keys) information is
-included in this template file
-
-**Testing**
-
-- [ ] module can be installed in a new virtual environment
-- [ ] pylint with the default `.pylintrc` settings run over the module produces
-minimal warnings; warnings that do exist have been confirmed as false positives
-- [ ] reasonably high level of unit test coverage covering all of the main logic
-of the code (e.g., missing coverage for raised errors that do not currently seem
-possible to reach is okay; missing coverage for options that will be needed is
-not)
-- [ ] all unit tests run without errors
diff --git a/combo_cases_and_deaths/cache/.gitignore b/combo_cases_and_deaths/cache/.gitignore
deleted file mode 100644
index e69de29bb..000000000
diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/README.md b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/README.md
deleted file mode 100644
index 859d8a4a0..000000000
--- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Combined Cases and Deaths
-
-We create a combined cases and deaths signal for visualization only (not available in the covidcast API).
-It includes all of the information from usa-facts, plus Puerto Rico from jhu-csse.
-
-## Running the Indicator
-
-The indicator is run by directly executing the Python script run.py.
-The safest way to do this is to create a virtual environment,
-install the common DELPHI tools, and then install the module and its
-dependencies. To do this, run the following code from this directory:
-
-```
-python -m venv env
-source env/bin/activate
-pip install ../_delphi_utils_python/.
-pip install covidcast
-```
-
-To execute the script and produce the output datasets (by default, in `receiving`), run
-the following:
-
-```
-env/bin/python run.py
-```
-By default, the script will generate the combined signal for the most recent data only (usually for yesterday only).
-If you want to produce the combined signal for all dates back to the first valid date, run the following:
-```
-env/bin/python run.py --date_range all
-```
-If you want to set a specific date range, run the following:
-```
-env/bin/python run.py --date_range yyyymmdd-yyyymmdd
-```
-
-Once you are finished with the code, you can deactivate the virtual environment and (optionally) remove the environment itself.
-```
-deactivate
-rm -r env
-```
-
diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__init__.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__init__.py
deleted file mode 100644
index c8e0a9417..000000000
--- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Module to combine the JHU and USA Facts indicators.
-
-This file defines the functions that are made public by the module. As the
-module is intended to be executed through the main method, these are primarily
-for testing.
-"""
-
-from __future__ import absolute_import
-
-from . import run
-
-__version__ = "0.1.0"
diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__main__.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__main__.py
deleted file mode 100644
index 143cf09bc..000000000
--- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/__main__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Call the function run_module when executed.
-
-This file indicates that calling the module (`python -m delphi_combo_cases_and_deaths`) will
-call the function `run_module` found within the run.py file. There should be
-no need to change this template.
-"""
-from delphi_utils import read_params
-from .run import run_module  # pragma: no cover
-
-run_module(read_params())  # pragma: no cover
diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/constants.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/constants.py
deleted file mode 100644
index e1d5724b4..000000000
--- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/constants.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Registry for signal names, geo types and other constants."""
-METRICS = [
-    "confirmed",
-    "deaths",
-]
-SMOOTH_TYPES = [
-    "",
-    "7dav",
-]
-SENSORS = [
-    "incidence_num",
-    "cumulative_num",
-    "incidence_prop",
-    "cumulative_prop",
-]
-GEO_RESOLUTIONS = [
-    "county",
-    "state",
-    "msa",
-    "hrr",
-    "hhs",
-    "nation"
-]
diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py
deleted file mode 100755
index d2d1229a9..000000000
--- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Functions to call when running the module.
-
-This module should contain a function called `run_module`, that is executed when
-the module is run with `python -m delphi_combo_cases_and_deaths`.
-This module produces a combined signal for jhu-csse and usa-facts. This signal
-is only used for visualization. It sources Puerto Rico from jhu-csse and
-everything else from usa-facts.
-"""
-from datetime import date, timedelta, datetime
-from itertools import product
-import re
-import time
-
-import covidcast
-import pandas as pd
-
-from delphi_utils import add_prefix, get_structured_logger
-from delphi_utils.geomap import GeoMapper
-from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS
-
-
-GMPR = GeoMapper()
-
-COLUMN_MAPPING = {"time_value": "timestamp",
-                  "geo_value": "geo_id",
-                  "value": "val",
-                  "stderr": "se",
-                  "sample_size": "sample_size"}
-
-EMPTY_FRAME = pd.DataFrame({}, columns=COLUMN_MAPPING.values())
-
-covidcast.covidcast._ASYNC_CALL = True  # pylint: disable=protected-access
-
-
-def maybe_append(usa_facts, jhu):
-    """
-    Append dataframes if available, otherwise return USAFacts.
-
-    If both data frames are available, append them and return.
-
-    If only USAFacts is available, return it.
-
-    If USAFacts is not available, return None.
-    """
-    if usa_facts is None:
-        return None
-    if jhu is None:
-        return usa_facts
-    return usa_facts.append(jhu)
-
-
-def compute_special_geo_dfs(df, signal, geo):
-    """Compute the signal values for special geos (HHS and nation).
-
-    For `num` signals, just replace the geocode to the appropriate resolution.
-    For `prop` signals, replace the geocode and then compute the proportion using the total
-    population of the US.
-
-    Parameters
-    ----------
-    df: DataFrame
-        Dataframe with num values at the county level.
-    signal: str
-        Signal name, should end with 'num' or 'prop'.
-    geo: str
-        Geo level to compute.
-    Returns
-    -------
-        DataFrame mapped to the 'geo' level with the correct signal values computed.
-    """
-    df = GMPR.replace_geocode(df,
-                              from_col="geo_id",
-                              from_code="fips",
-                              new_code="state_code")
-    df = GMPR.add_population_column(df, "state_code")  # use total state population
-    df = GMPR.replace_geocode(df, from_code="state_code", new_code=geo)
-    if signal.endswith("_prop"):
-        df["val"] = df["val"] / df["population"] * 100000
-    df.drop("population", axis=1, inplace=True)
-    df.rename({geo: "geo_id"}, axis=1, inplace=True)
-    return df
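A worked instance of the `prop` arithmetic above, for readers skimming the diff (illustrative numbers only; the population figure is made up, not the real US total):

```python
# Worked example of the `prop` transformation: after mapping counties up to
# "nation", counts become rates per 100,000 population.
nation_count = 450              # summed county `num` values for one day (toy value)
nation_population = 300_000_000  # illustrative population, not the real figure
prop = nation_count / nation_population * 100000
print(prop)  # 0.15 cases per 100,000 people
```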
- """ - df = GMPR.replace_geocode(df, - from_col="geo_id", - from_code="fips", - new_code="state_code") - df = GMPR.add_population_column(df, "state_code") # use total state population - df = GMPR.replace_geocode(df, from_code="state_code", new_code=geo) - if signal.endswith("_prop"): - df["val"] = df["val"]/df["population"] * 100000 - df.drop("population", axis=1, inplace=True) - df.rename({geo: "geo_id"}, axis=1, inplace=True) - return df - - -def merge_dfs_by_geos(usafacts_df, jhu_df, geo): - """Combine the queried usafacts and jhu dataframes based on the geo type.""" - # State level - if geo == 'state': - combined_df = maybe_append( - usafacts_df, - jhu_df if jhu_df is None else jhu_df[jhu_df["geo_value"] == 'pr']) # add territories - # County level - elif geo == 'county': - combined_df = maybe_append( - usafacts_df, - jhu_df if jhu_df is None else jhu_df[jhu_df["geo_value"].str.startswith("72")]) - # For MSA and HRR level, they are the same - elif geo == 'msa': - df = GMPR.get_crosswalk("fips", "msa") - puerto_rico_mask = df["fips"].str.startswith("72") - puerto_rico_msas = df[puerto_rico_mask]["msa"].unique() - combined_df = maybe_append( - usafacts_df, - jhu_df if jhu_df is None else jhu_df[jhu_df["geo_value"].isin(puerto_rico_msas)]) - else: - combined_df = usafacts_df - combined_df.rename(COLUMN_MAPPING, axis=1, inplace=True) - - return combined_df - - -def get_updated_dates(signal, geo, date_range, issue_range=None, fetcher=covidcast.signal): - """Return the unique dates of the values that were updated in a given issue range in a geo.""" - usafacts_df = fetcher( - "usa-facts", signal, - date_range[0], date_range[1], - geo, - issues=issue_range - ) - jhu_df = fetcher( - "jhu-csse", signal, - date_range[0], date_range[1], - geo, - issues=issue_range - ) - - if usafacts_df is None: - return None - - merged_df = merge_dfs_by_geos(usafacts_df, jhu_df, geo) - timestamp_mask = merged_df["timestamp"]<=usafacts_df["timestamp"].max() - unique_dates = merged_df.loc[timestamp_mask]["timestamp"].unique() - return unique_dates - - -def combine_usafacts_and_jhu(signal, geo, date_range, logger, - issue_range=None, fetcher=covidcast.signal): - """Add rows for PR from JHU signals to USA-FACTS signals. - - For hhs and nation, fetch the county `num` data so we can compute the proportions correctly - and after combining JHU and USAFacts and mapping to the desired geos. 
- """ - is_special_geo = geo in ["hhs", "nation"] - geo_to_fetch = "county" if is_special_geo else geo - signal_to_fetch = signal.replace("_prop", "_num") if is_special_geo else signal - - unique_dates = get_updated_dates( - signal_to_fetch, geo_to_fetch, date_range, issue_range, fetcher - ) - - # This occurs if the usafacts ~and the jhu query were empty - if unique_dates is None: - logger.info("USA-FACTS completely unavailable for dates", date_range=date_range) - return EMPTY_FRAME - - # Query only the represented window so that every geo is represented; a single window call is - # faster than a fetch for every date in unique_dates even in cases of 1:10 sparsity, - # i.e., len(unique_dates):len(max(unique_dates) - min(unique_dates)) - query_min, query_max = unique_dates.min(), unique_dates.max() - usafacts_df = fetcher( - "usa-facts", signal_to_fetch, - query_min, query_max, - geo_to_fetch, - ) - jhu_df = fetcher( - "jhu-csse", signal_to_fetch, - query_min, query_max, - geo_to_fetch, - ) - combined_df = merge_dfs_by_geos(usafacts_df, jhu_df, geo_to_fetch) - - # default sort from API is ORDER BY signal, time_value, geo_value, issue - # we want to drop all but the most recent (last) issue - combined_df.drop_duplicates( - subset=["geo_id", "timestamp"], - keep="last", - inplace=True - ) - - if is_special_geo: - combined_df = compute_special_geo_dfs(combined_df, signal, geo) - if "se" not in combined_df.columns and "sample_size" not in combined_df.columns: - # if a column has non numeric data including None, they'll be dropped. - # se and sample size are required later so we add them back. - combined_df["se"] = combined_df["sample_size"] = None - combined_df.rename({geo: "geo_id"}, axis=1, inplace=True) - - return combined_df - -def extend_raw_date_range(params, sensor_name): - """Extend the date range of the raw data backwards by 7 days. - - A complete issue includes smoothed signals as well as all raw data - that contributed to the smoothed values, so that it's possible to use - the raw values in the API to reconstruct the smoothed signal at will. - The smoother we're currently using incorporates the previous 7 - days of data, so we must extend the date range of the raw data - backwards by 7 days. 
- """ - if sensor_name.find("7dav") < 0: - return [ - params['indicator']['date_range'][0] - timedelta(days=7), - params['indicator']['date_range'][-1] - ] - return params['indicator']['date_range'] - -def next_missing_day(source, signals): - """Fetch the first day for which we want to generate new data.""" - meta_df = covidcast.metadata() - meta_df = meta_df[meta_df["data_source"] == source] - meta_df = meta_df[meta_df["signal"].isin(signals)] - # min: use the max_time of the most lagged signal, in case they differ - # +timedelta: the subsequent day is the first day of new data to generate - day = min(meta_df["max_time"]) + timedelta(days=1) - return day - -def sensor_signal(metric, sensor, smoother): - """Generate the signal name for a particular configuration.""" - if smoother == "7dav": - sensor_name = "_".join([smoother, sensor]) - else: - sensor_name = sensor - return sensor_name, "_".join([metric, sensor_name]) - -def configure(variants, params): - """Validate params file and set date range.""" - params['indicator']['export_start_date'] = date(*params['indicator']['export_start_date']) - yesterday = date.today() - timedelta(days=1) - next_day = next_missing_day( - params['indicator']["source"], - set(signal[-1] for signal in variants) - ) - configure_range(params, 'date_range', yesterday, next_day) - # pad issue range in case we caught jhu but not usafacts or v/v in the last N issues; - # issue_days also needs to be set to a value large enough to include values you would like - # to reissue - try: - issue_days = params['indicator']['issue_days'] - except KeyError: - issue_days = 7 - configure_range(params, 'issue_range', yesterday, next_day - timedelta(days=issue_days)) - return params - -def configure_range(params, range_param, yesterday, next_day): - """Configure a parameter which stores a range of dates. - - May be specified in params.json as: - "new" - set to [next_day, yesterday] - "all" - set to [export_start_date, yesterday] - yyyymmdd-yyyymmdd - set to exact range - """ - if range_param not in params['indicator'] or params['indicator'][range_param] == 'new': - # only create combined file for the newest update - # (usually for yesterday, but check just in case) - params['indicator'][range_param] = [ - min( - yesterday, - next_day - ), - yesterday - ] - elif params['indicator'][range_param] == 'all': - # create combined files for all of the historical reports - if range_param == 'date_range': - params['indicator'][range_param] = [params['indicator']['export_start_date'], yesterday] - elif range_param == 'issue_range': - # for issue_range=all we want the latest issue for all requested - # dates, aka the default when issue is unspecified - params['indicator'][range_param] = None - else: - raise ValueError( - f"Bad Programmer: Invalid range_param '{range_param}';" - f"expected 'date_range' or 'issue_range'") - else: - match_res = re.findall(re.compile(r'^\d{8}-\d{8}$'), params['indicator'][range_param]) - if len(match_res) == 0: - raise ValueError( - f"Invalid {range_param} parameter. Try (new, all, yyyymmdd-yyyymmdd).") - try: - date1 = datetime.strptime(params['indicator'][range_param][:8], '%Y%m%d').date() - except ValueError as error: - raise ValueError( - f"Invalid {range_param} parameter. Please check the first date.") from error - try: - date2 = datetime.strptime(params['indicator'][range_param][-8:], '%Y%m%d').date() - except ValueError as error: - raise ValueError( - f"Invalid {range_param} parameter. 
-
-def configure(variants, params):
-    """Validate params file and set date range."""
-    params['indicator']['export_start_date'] = date(*params['indicator']['export_start_date'])
-    yesterday = date.today() - timedelta(days=1)
-    next_day = next_missing_day(
-        params['indicator']["source"],
-        set(signal[-1] for signal in variants)
-    )
-    configure_range(params, 'date_range', yesterday, next_day)
-    # pad issue range in case we caught jhu but not usafacts or v/v in the last N issues;
-    # issue_days also needs to be set to a value large enough to include values you would like
-    # to reissue
-    try:
-        issue_days = params['indicator']['issue_days']
-    except KeyError:
-        issue_days = 7
-    configure_range(params, 'issue_range', yesterday, next_day - timedelta(days=issue_days))
-    return params
-
-def configure_range(params, range_param, yesterday, next_day):
-    """Configure a parameter which stores a range of dates.
-
-    May be specified in params.json as:
-    "new" - set to [next_day, yesterday]
-    "all" - set to [export_start_date, yesterday]
-    yyyymmdd-yyyymmdd - set to exact range
-    """
-    if range_param not in params['indicator'] or params['indicator'][range_param] == 'new':
-        # only create combined file for the newest update
-        # (usually for yesterday, but check just in case)
-        params['indicator'][range_param] = [
-            min(
-                yesterday,
-                next_day
-            ),
-            yesterday
-        ]
-    elif params['indicator'][range_param] == 'all':
-        # create combined files for all of the historical reports
-        if range_param == 'date_range':
-            params['indicator'][range_param] = [params['indicator']['export_start_date'], yesterday]
-        elif range_param == 'issue_range':
-            # for issue_range=all we want the latest issue for all requested
-            # dates, aka the default when issue is unspecified
-            params['indicator'][range_param] = None
-        else:
-            raise ValueError(
-                f"Bad Programmer: Invalid range_param '{range_param}'; "
-                f"expected 'date_range' or 'issue_range'")
-    else:
-        match_res = re.findall(re.compile(r'^\d{8}-\d{8}$'), params['indicator'][range_param])
-        if len(match_res) == 0:
-            raise ValueError(
-                f"Invalid {range_param} parameter. Try (new, all, yyyymmdd-yyyymmdd).")
-        try:
-            date1 = datetime.strptime(params['indicator'][range_param][:8], '%Y%m%d').date()
-        except ValueError as error:
-            raise ValueError(
-                f"Invalid {range_param} parameter. Please check the first date.") from error
-        try:
-            date2 = datetime.strptime(params['indicator'][range_param][-8:], '%Y%m%d').date()
-        except ValueError as error:
-            raise ValueError(
-                f"Invalid {range_param} parameter. Please check the second date.") from error
-
-        # ensure valid start date
-        if date1 < params['indicator']['export_start_date']:
-            date1 = params['indicator']['export_start_date']
-        params['indicator'][range_param] = [date1, date2]
-
-def run_module(params):
-    """
-    Produce a combined cases and deaths signal using data from JHU and USA Facts.
-
-    Parameters
-    ----------
-    params
-        Dictionary containing indicator configuration. Expected to have the following structure:
-        - "common":
-            - "export_dir": str, directory to write output.
-            - "log_exceptions" (optional): bool, whether to log exceptions to file.
-            - "log_filename" (optional): str, name of file to write logs.
-        - "indicator":
-            - "export_start_date": list of ints, [year, month, day] format, first day to begin
-                data exports from.
-            - "date_range": str, YYYYMMDD-YYYYMMDD format, range of dates to generate data for.
-            - "source": str, name of combo indicator in metadata.
-            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
-    """
-    start_time = time.time()
-    variants = [tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
-                for (metric, geo_res, sensor, smoother) in
-                product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)]
-    variants = [i for i in variants if not ("7dav" in i[2] and "cumulative" in i[2])]
-    params = configure(variants, params)
-    logger = get_structured_logger(
-        __name__, filename=params["common"].get("log_filename"),
-        log_exceptions=params["common"].get("log_exceptions", True))
-
-    for metric, geo_res, sensor_name, signal in variants:
-        logger.info("Generating signal and exporting to CSV",
-                    geo_res=geo_res,
-                    metric=metric,
-                    sensor=sensor_name,
-                    signal=signal)
-        df = combine_usafacts_and_jhu(signal,
-                                      geo_res,
-                                      extend_raw_date_range(params, sensor_name),
-                                      logger,
-                                      params['indicator']['issue_range'])
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        start_date = pd.to_datetime(params['indicator']['export_start_date'])
-        export_dir = params["common"]["export_dir"]
-        dates = pd.Series(
-            df[df["timestamp"] >= start_date]["timestamp"].unique()
-        ).sort_values()
-
-        signal_name = add_prefix([signal],
-                                 wip_signal=params['indicator']["wip_signal"],
-                                 prefix="wip_")
-        for date_ in dates:
-            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
-            df[df["timestamp"] == date_][["geo_id", "val", "se", "sample_size"]].to_csv(
-                f"{export_dir}/{export_fn}", index=False, na_rep="NA"
-            )
-
-    elapsed_time_in_seconds = round(time.time() - start_time, 2)
-    logger.info("Completed indicator run",
-                elapsed_time_in_seconds=elapsed_time_in_seconds)
diff --git a/combo_cases_and_deaths/params.json.template b/combo_cases_and_deaths/params.json.template
deleted file mode 100644
index 9c009e0d5..000000000
--- a/combo_cases_and_deaths/params.json.template
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "common": {
-    "log_exceptions": false,
-    "export_dir": "./receiving",
-    "log_filename": "./indicator-combination.log"
-  },
-  "indicator": {
-    "export_start_date": [2020, 4, 1],
-    "date_range": "new",
-    "issue_days": 7,
-    "source": "indicator-combination",
-    "wip_signal": ""
-  },
-  "validation": {
-    "common": {
-      "data_source": "indicator-combination",
-      "span_length": 14,
-      "min_expected_lag": {"all": "2"},
-      "max_expected_lag": {"all": "6"},
-      "dry_run": true,
-      "suppressed_errors": [{"check_name": "check_val_lt_0"}]
-    },
-    "static": {
-      "minimum_sample_size": 5,
-      "missing_se_allowed": true,
-      "missing_sample_size_allowed": true
-    },
-    "dynamic": {
-      "ref_window_size": 7
-    }
-  }
-}
-
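The `date_range` and `issue_days` entries in this template feed `configure_range` above. A short sketch of the three accepted `date_range` spellings (illustrative dates; assumes `configure_range` is importable from run.py):

```python
from datetime import date, timedelta

# "date_range" spellings accepted by configure_range (defined in run.py above):
#   "new"               -> [min(yesterday, next_day), yesterday]
#   "all"               -> [export_start_date, yesterday]
#   "yyyymmdd-yyyymmdd" -> exact range, with the start clipped to export_start_date
params = {"indicator": {"export_start_date": date(2020, 4, 1),
                        "date_range": "20200401-20200415"}}
configure_range(params, "date_range",
                yesterday=date.today() - timedelta(days=1),
                next_day=date.today())
print(params["indicator"]["date_range"])
# [datetime.date(2020, 4, 1), datetime.date(2020, 4, 15)]
```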
diff --git a/combo_cases_and_deaths/receiving/.gitignore b/combo_cases_and_deaths/receiving/.gitignore
deleted file mode 100644
index afed0735d..000000000
--- a/combo_cases_and_deaths/receiving/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.csv
diff --git a/combo_cases_and_deaths/setup.py b/combo_cases_and_deaths/setup.py
deleted file mode 100644
index db97840a7..000000000
--- a/combo_cases_and_deaths/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from setuptools import setup
-from setuptools import find_packages
-
-required = [
-    "pandas",
-    "pydocstyle",
-    "pytest",
-    "pytest-cov",
-    "pylint==2.8.3",
-    "delphi-utils",
-    "covidcast>=0.1.4"
-]
-
-setup(
-    name="delphi_combo_cases_and_deaths",
-    version="0.1.0",
-    description="A combined signal for cases and deaths using JHU for Puerto Rico and USA Facts everywhere else",
-    author="Jingjing Tang, Kathryn Mazaitis",
-    author_email="krivard@cs.cmu.edu",
-    url="https://github.com/cmu-delphi/covidcast-indicators",
-    install_requires=required,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Programming Language :: Python :: 3.8",
-    ],
-    packages=find_packages(),
-)
diff --git a/combo_cases_and_deaths/static/.gitignore b/combo_cases_and_deaths/static/.gitignore
deleted file mode 100644
index e69de29bb..000000000
diff --git a/combo_cases_and_deaths/tests/receiving/.gitkeep b/combo_cases_and_deaths/tests/receiving/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/combo_cases_and_deaths/tests/test_run.py b/combo_cases_and_deaths/tests/test_run.py
deleted file mode 100644
index c799b6ed3..000000000
--- a/combo_cases_and_deaths/tests/test_run.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""Tests for running combo cases and deaths indicator."""
-import logging
-from datetime import date
-from itertools import product
-import os
-import unittest
-from unittest.mock import patch, call
-import pandas as pd
-import numpy as np
-
-from delphi_combo_cases_and_deaths.run import (
-    run_module,
-    extend_raw_date_range,
-    get_updated_dates,
-    sensor_signal,
-    combine_usafacts_and_jhu,
-    compute_special_geo_dfs,
-    COLUMN_MAPPING)
-from delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS
-from delphi_utils.geomap import GeoMapper
-
-TEST_LOGGER = logging.getLogger()
-
-def test_issue_dates():
-    """The smoothed value for a particular date is computed from the raw
-    values for a span of dates. We want users to be able to see in the
-    API all the raw values that went into the smoothed computation,
-    for transparency and peer review. This means that each issue
-    should contain more days of raw data than smoothed data.
-    """
-    reference_dr = [date.today(), date.today()]
-    params = {'indicator': {'date_range': reference_dr}}
-    n_changed = 0
-    variants = [sensor_signal(metric, sensor, smoother) for
-                metric, sensor, smoother in
-                product(METRICS, SENSORS, SMOOTH_TYPES)]
-    variants_changed = []
-    for sensor_name, _ in variants:
-        dr = extend_raw_date_range(params, sensor_name)
-        if dr[0] != reference_dr[0]:
-            n_changed += 1
-            variants_changed.append(sensor_name)
-    assert n_changed == len(variants) / 2, f"""
-Raw variants should post more days than smoothed.
-All variants: {variants}
-Date-extended variants: {variants_changed}
-"""
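`test_issue_dates` above exercises `extend_raw_date_range` from run.py; a concrete run of that function with illustrative dates:

```python
from datetime import date

params = {"indicator": {"date_range": [date(2021, 1, 8), date(2021, 1, 14)]}}

# Raw sensor: the window is pushed back 7 days so each issue carries the
# raw values the 7-day smoother consumed.
print(extend_raw_date_range(params, "incidence_num"))
# [datetime.date(2021, 1, 1), datetime.date(2021, 1, 14)]

# Smoothed sensor: the window is unchanged.
print(extend_raw_date_range(params, "7dav_incidence_num"))
# [datetime.date(2021, 1, 8), datetime.date(2021, 1, 14)]
```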
- """ - date_count = [1] - def jhu(geo, c=date_count): - if geo == "state": - geo_val = "pr" - elif geo == "msa": - geo_val = "38660" - else: - geo_val = "72001" - return pd.DataFrame( - [(date.fromordinal(c[0]),geo_val,1,1,1)], - columns="timestamp geo_value value stderr sample_size".split()) - def uf(geo, c=date_count): - if geo == "state": - geo_val = "ny" - elif geo == "msa": - geo_val = "10580" - else: - geo_val = "36001" - return pd.DataFrame( - [(date.fromordinal(c[0]),geo_val,1,1,1)], - columns="timestamp geo_value value stderr sample_size".split()) - def make_mock(geo): - # The first two in each row provide a unique_date array of the appropriate length for - # query of the latter two (in combine_usafacts_and_jhu) - return [ - # 1 0 - uf(geo), None, uf(geo), None, - # 0 1 - None, jhu(geo), - # 1 1 - uf(geo), jhu(geo), uf(geo), jhu(geo), - # 0 0 - None, None - ] - - geos = ["state", "county", "msa", "nation", "hhs"] - outputs = [df for g in geos for df in make_mock(g)] - mock_covidcast_signal.side_effect = outputs[:] - - date_range = [date.today(), date.today()] - - calls = 0 - for geo in geos: - for config, call_size, expected_size in [ - ("1 0", 4, 1), - ("0 1", 2, 0), - ("1 1", 4, 1 if geo in ["nation", "hhs"] else 2), - ("0 0", 2, 0) - ]: - df = combine_usafacts_and_jhu("", geo, date_range, TEST_LOGGER, fetcher=mock_covidcast_signal) - assert df.size == expected_size * len(COLUMN_MAPPING), f""" -Wrong number of rows in combined data frame for the number of available signals. - -input for {geo} {config}: -{outputs[calls]} -{outputs[calls + 1]} - -output: -{df} - -expected rows: {expected_size} -""" - calls += call_size - date_count[0] += 1 - -@patch("covidcast.covidcast.signal") -def test_multiple_issues(mock_covidcast_signal): - """Verify that only the most recent issue is retained.""" - mock_covidcast_signal.side_effect = [ - pd.DataFrame({ - "geo_value": ["01000", "01000"], - "value": [1, 10], - "timestamp": [20200101, 20200101], - "issue": [20200102, 20200104] - }), - None - ] * 2 - result = combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal) - pd.testing.assert_frame_equal( - result, - pd.DataFrame( - { - "geo_id": ["01000"], - "val": [10], - "timestamp": [20200101], - "issue": [20200104] - }, - index=[1] - ) - ) - -def test_compute_special_geo_dfs(): - test_df = pd.DataFrame({"geo_id": ["01000", "01001"], - "val": [50, 100], - "timestamp": [20200101, 20200101]},) - df = compute_special_geo_dfs(test_df, "_prop", "nation") - state_pop = GeoMapper().get_crosswalk("state_code", "pop") - state_pop = int(state_pop.loc[state_pop.state_code == "01", "pop"]) - expected_df = pd.DataFrame({ - "timestamp": [20200101], - "geo_id": ["us"], - "val": [150/state_pop*100000] - }) - pd.testing.assert_frame_equal(df, expected_df) - pd.testing.assert_frame_equal( - compute_special_geo_dfs(test_df, "_num", "nation"), - pd.DataFrame({"timestamp": [20200101], - "geo_id": ["us"], - "val": [150]}) - ) - -@patch("covidcast.covidcast.signal") -def test_get_updated_dates(mock_covidcast_signal): - mock_covidcast_signal.side_effect = [ - pd.DataFrame({"geo_value": ["01000", "01001"], - "value": [50, 100], - "timestamp": [20200101, 20200103]}), - pd.DataFrame({"geo_value": ["72001", "01001"], - "value": [200, 100], - "timestamp": [20200101, 20200101]}) - ] - updated_dates = get_updated_dates( - "confirmed_incidence_num", - "nation", - date_range=(0, 1), - fetcher=mock_covidcast_signal) - assert np.allclose(updated_dates, 
-
-@patch("covidcast.covidcast.signal")
-def test_combine_usafacts_and_jhu_special_geos(mock_covidcast_signal):
-    mock_covidcast_signal.side_effect = [
-        pd.DataFrame({"geo_value": ["01000", "01001"],
-                      "value": [50, 100],
-                      "timestamp": [20200101, 20200101]}),
-        pd.DataFrame({"geo_value": ["72001", "01001"],
-                      "value": [200, 100],
-                      "timestamp": [20200101, 20200101]}),
-    ] * 6  # each of the three combine_usafacts_and_jhu calls below makes (2 + 2 * len(unique_timestamps)) = 4 fetcher calls, 12 in total
-
-    pd.testing.assert_frame_equal(
-        combine_usafacts_and_jhu("confirmed_incidence_num", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal),
-        pd.DataFrame({"timestamp": [20200101],
-                      "geo_id": ["us"],
-                      "val": [50 + 100 + 200],
-                      "se": [None],
-                      "sample_size": [None]})
-    )
-    df = combine_usafacts_and_jhu("confirmed_incidence_prop", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal)
-    state_pop = GeoMapper().get_crosswalk("state_code", "pop")
-    state_pop = int(state_pop.loc[state_pop.state_code.isin(["01", "72"]), "pop"].sum())
-    expected_df = pd.DataFrame({
-        "timestamp": [20200101],
-        "geo_id": ["us"],
-        "val": [(50 + 100 + 200) / state_pop * 100000],
-        "se": [None],
-        "sample_size": [None]
-    })
-    pd.testing.assert_frame_equal(df, expected_df)
-    pd.testing.assert_frame_equal(
-        combine_usafacts_and_jhu("confirmed_incidence_num", "county", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal),
-        pd.DataFrame({"geo_id": ["01000", "01001", "72001"],
-                      "val": [50, 100, 200],
-                      "timestamp": [20200101, 20200101, 20200101]},
-                     index=[0, 1, 0])
-    )
-
-@patch("covidcast.covidcast.signal")
-def test_no_nation_jhu(mock_covidcast_signal):
-    """
-    If we get JHU data that extends farther into the future than USAFacts data, trim it off.
-    """
-    cvc_columns = "time_value geo_value value stderr sample_size".split()
-    mock_covidcast_signal.side_effect = [
-        pd.DataFrame({"geo_value": ["01000"],
-                      "value": [50],
-                      "timestamp": [20200101]},),
-        pd.DataFrame({"geo_value": ["72001", "72001"],
-                      "value": [1, 1],
-                      "timestamp": [20200101, 20200102]}),
-        pd.DataFrame({"geo_value": ["01000"],
-                      "value": [50],
-                      "timestamp": [20200101]},),
-        pd.DataFrame({"geo_value": ["72001"],
-                      "value": [1],
-                      "timestamp": [20200101]})
-    ]
-    result = combine_usafacts_and_jhu("_num", "nation", date_range=(0, 1), logger=TEST_LOGGER, fetcher=mock_covidcast_signal)
-
-    assert mock_covidcast_signal.call_args_list[-1] == call(
-        "jhu-csse",
-        "_num",
-        20200101,
-        20200101,
-        "county"
-    )
-    pd.testing.assert_frame_equal(
-        result,
-        pd.DataFrame({"timestamp": [20200101],
-                      "geo_id": ["us"],
-                      "val": [51],
-                      "se": [None],
-                      "sample_size": [None]},)
-    )
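`test_output_files` below enumerates filenames in the `{date}_{geo}_{signal}.csv` layout used by `run_module`'s export loop; one variant spelled out:

```python
from datetime import date

# Filename layout used by run_module's export loop above.
date_ = date(2021, 1, 4)
geo_res = "nation"
signal_name = "confirmed_7dav_incidence_num"
export_fn = f"{date_.strftime('%Y%m%d')}_{geo_res}_{signal_name}.csv"
print(export_fn)  # 20210104_nation_confirmed_7dav_incidence_num.csv
```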
-
-@patch("delphi_combo_cases_and_deaths.run.combine_usafacts_and_jhu")
-def test_output_files(mock_combine):
-    params = {
-        "common": {
-            "export_dir": "./receiving"
-        },
-        "indicator": {
-            "export_start_date": [2020, 4, 1],
-            "source": "indicator-combination",
-            "wip_signal": ""
-        }
-    }
-    mock_combine.return_value = pd.DataFrame(
-        {
-            "geo_id": ["01000"],
-            "val": [10],
-            "timestamp": [pd.to_datetime("2021-01-04")],
-            "issue": [pd.to_datetime("2021-01-04")],
-            "se": 0,
-            "sample_size": 0
-        },
-        index=[1]
-    )
-    run_module(params)
-    csv_files = [f for f in os.listdir("receiving") if f.endswith(".csv")]
-    dates = ["20210104"]
-    geos = ["county", "hrr", "msa", "state", "hhs", "nation"]
-
-    # enumerate metric names
-    metrics = []
-    for event, span, stat in product(["deaths", "confirmed"],
-                                     ["cumulative", "incidence"],
-                                     ["num", "prop"]):
-        metrics.append("_".join([event, span, stat]))
-        metrics.append("_".join([event, "7dav", span, stat]))
-
-    expected_files = []
-    for date in dates:
-        for geo in geos:
-            for metric in metrics:
-                if "7dav" in metric and "cumulative" in metric:
-                    continue
-                expected_files += [date + "_" + geo + "_" + metric + ".csv"]
-    assert set(csv_files) == set(expected_files)
-
-if __name__ == '__main__':
-    unittest.main()