cmu-delphi · Ananya-Joshi · Sep 8, 2021 · Sep 9, 2021 · Sep 9, 2021 · Sep 9, 2021
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -16,7 +16,7 @@ jobs:
     if: github.event.pull_request.draft == false
     strategy:
       matrix:
-        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, covid_act_now, doctor_visits, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts]
+        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, covid_act_now, doctor_visits, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts, cdc_vaccines]
     defaults:
       run:
         working-directory: ${{ matrix.packages }}

diff --git a/cdc_vaccines/.pylintrc b/cdc_vaccines/.pylintrc
@@ -0,0 +1,22 @@
+
+[MESSAGES CONTROL]
+
+disable=logging-format-interpolation,
+    too-many-locals,
+    too-many-arguments,
+    # Allow pytest functions to be part of a class.
+    no-self-use,
+    # Allow pytest classes to have one test.
+    too-few-public-methods
+
+[BASIC]
+
+# Allow arbitrarily short-named variables.
+variable-rgx=[a-z_][a-z0-9_]*
+argument-rgx=[a-z_][a-z0-9_]*
+attr-rgx=[a-z_][a-z0-9_]*
+
+[DESIGN]
+
+# Don't complain about pytest "unused" arguments.
+ignored-argument-names=(_.*|run_as_module)
diff --git a/cdc_vaccines/Makefile b/cdc_vaccines/Makefile
@@ -0,0 +1,29 @@
+# Every target here is a command rather than a file. Declare them phony so a
+# stray file or directory with the same name can never mask a target.
+.PHONY: venv install lint test clean run
+
+# Name of the indicator package directory (delphi_*), resolved once at parse time.
+dir := $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*')
+
+# Create the virtual environment in ./env.
+venv:
+	python3.8 -m venv env
+
+# Install wheel, the shared delphi_utils package and this package (editable)
+# into ./env. Steps are chained with && so a failed activation or install
+# stops the recipe instead of installing into the wrong interpreter.
+install: venv
+	. env/bin/activate && \
+	pip install wheel && \
+	pip install -e ../_delphi_utils_python && \
+	pip install -e .
+
+# Static checks: pylint, then pydocstyle, over the package.
+lint:
+	. env/bin/activate && pylint $(dir)
+	. env/bin/activate && pydocstyle $(dir)
+
+# Run the unit tests from tests/ with a coverage report for the package.
+test:
+	. env/bin/activate && \
+	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
+
+# Remove the virtual environment and the local params file.
+clean:
+	rm -rf env
+	rm -f params.json
+
+# Run the indicator, then the validator (dry run) and the archiver.
+run:
+	env/bin/python -m $(dir)
+	env/bin/python -m delphi_utils.validator --dry_run
+	env/bin/python -m delphi_utils.archive
diff --git a/cdc_vaccines/README.md b/cdc_vaccines/README.md
@@ -0,0 +1,69 @@
+# CDC Vaccinations
+
+This indicator provides the official vaccination counts in the US. We export the county-level 
+daily vaccination rates data as-is, and publish the result as a COVIDcast signal. 
+We also aggregate the data to the MSA, HRR, State, HHS Region, and Nation levels. 
+For detailed information, see the file DETAILS.md contained in this directory.
+
+Note that individuals could be vaccinated outside of the US. Additionally, 
+there is no county level data for counties in Texas and Hawaii. Each state has some vaccination counts assigned to "unknown county". Some vaccination counts are assigned to "unknown state, unknown county".
+
+
+## Running the Indicator
+
+The indicator is run by directly executing the Python module contained in this
+directory. The safest way to do this is to create a virtual environment,
+install the common DELPHI tools, and then install the module and its
+dependencies. To do this, run the following command from this directory:
+
+```
+make install
+```
+
+This command will install the package in editable mode, so you can make changes that
+will automatically propagate to the installed package. 
+
+All of the user-changeable parameters are stored in `params.json`. To execute
+the module and produce the output datasets (by default, in `receiving`), run
+the following:
+
+```
+env/bin/python -m delphi_cdc_vaccines
+```
+
+If you want to enter the virtual environment in your shell, 
+you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. 
+
+Once you are finished, you can remove the virtual environment and 
+params file with the following:
+
+```
+make clean
+```
+
+## Testing the code
+
+To run static tests of the code style, run the following command:
+
+```
+make lint
+```
+
+Unit tests are also included in the module. To execute these, run the following
+command from this directory:
+
+```
+make test
+```
+
+To run individual tests, run the following:
+
+```
+(cd tests && ../env/bin/pytest test_run.py --cov=delphi_cdc_vaccines --cov-report=term-missing)
+```
+
+The output will show the number of unit tests that passed and failed, along
+with the percentage of code covered by the tests. 
+
+None of the linting or unit tests should fail. The set of code lines not covered by unit tests should be small and
+should not include critical sub-routines.
diff --git a/cdc_vaccines/REVIEW.md b/cdc_vaccines/REVIEW.md
@@ -0,0 +1,38 @@
+## Code Review (Python)
+
+A code review of this module should include a careful look at the code and the
+output. To assist in the process, but certainly not in place of it, please
+check the following items.
+
+**Documentation**
+
+- [ ] the README.md file template is filled out and currently accurate; it is
+possible to load and test the code using only the instructions given
+- [ ] minimal docstrings (one line describing what the function does) are
+included for all functions; full docstrings describing the inputs and expected
+outputs should be given for non-trivial functions
+
+**Structure**
+
+- [ ] code should pass lint checks (`make lint`)
+- [ ] any required metadata files are checked into the repository and placed
+within the directory `static`
+- [ ] any intermediate files that are created and stored by the module should
+be placed in the directory `cache`
+- [ ] final expected output files to be uploaded to the API are placed in the
+`receiving` directory; output files should not be committed to the repository
+- [ ] all options and API keys are passed through the file `params.json`
+- [ ] template parameter file (`params.json.template`) is checked into the
+code; no personal (i.e., usernames) or private (i.e., API keys) information is
+included in this template file
+
+**Testing**
+
+- [ ] module can be installed in a new virtual environment (`make install`)
+- [ ] reasonably high level of unit test coverage covering all of the main logic
+of the code (e.g., missing coverage for raised errors that do not currently seem
+possible to reach are okay; missing coverage for options that will be needed are
+not)
+- [ ] all unit tests run without errors (`make test`)
+- [ ] indicator directory has been added to GitHub CI
+(`covidcast-indicators/.github/workflows/python-ci.yml`)
diff --git a/cdc_vaccines/cache/.gitignore b/cdc_vaccines/cache/.gitignore
diff --git a/cdc_vaccines/delphi_cdc_vaccines/__init__.py b/cdc_vaccines/delphi_cdc_vaccines/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+"""Module to pull and clean indicators from the CDC source.
+
+This file defines the functions that are made public by the module. As the
+module is intended to be executed through the main method, these are primarily
+for testing.
+"""
+
+from __future__ import absolute_import
+from . import pull
+from . import run
+
+__version__ = "0.1.0"
diff --git a/cdc_vaccines/delphi_cdc_vaccines/__main__.py b/cdc_vaccines/delphi_cdc_vaccines/__main__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""Call the function run_module when executed.
+
+This file indicates that calling the module (`python -m delphi_cdc_vaccines`) will
+call the function `run_module` found within the run.py file. There should be
+no need to change this template.
+"""
+
+from delphi_utils import read_params
+from .run import run_module  # pragma: no cover
+
+run_module(read_params())  # pragma: no cover
diff --git a/cdc_vaccines/delphi_cdc_vaccines/constants.py b/cdc_vaccines/delphi_cdc_vaccines/constants.py
@@ -0,0 +1,33 @@
+"""Registry for variations."""
+
+from itertools import product
+from delphi_utils import Smoother
+
+
+CUMULATIVE = 'cumulative'
+INCIDENCE ='incidence'
+FREQUENCY = [CUMULATIVE, INCIDENCE]
+STATUS = ["tot", "part"]
+AGE = ["", "_12P", "_18P", "_65P"]
+
+SIGNALS = [f"{frequency}_counts_{status}_vaccine{AGE}" for
+	frequency, status, age in product(FREQUENCY, STATUS, AGE)]
+DIFFERENCE_MAPPING = {
+    f"{INCIDENCE}_counts_{status}_vaccine{age}": f"{CUMULATIVE}_counts_{status}_vaccine{age}"
+    for status, age in product(STATUS, AGE)
+}
+SIGNALS = list(DIFFERENCE_MAPPING.keys()) + list(DIFFERENCE_MAPPING.values())
+
+
+GEOS = [
+    "nation",
+    "state",
+    "hrr",
+    "hhs",
+    "msa"
+]
+
+SMOOTHERS = [
+    (Smoother("identity", impute_method=None), ""),
+    (Smoother("moving_average", window_length=7), "_7dav"),
+]
diff --git a/cdc_vaccines/delphi_cdc_vaccines/pull.py b/cdc_vaccines/delphi_cdc_vaccines/pull.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+"""Functions for pulling data from the CDC data website for vaccines."""
+import hashlib
+from logging import Logger
+from delphi_utils.geomap import GeoMapper
+import numpy as np
+import pandas as pd
+from .constants import SIGNALS, DIFFERENCE_MAPPING
+
+
+
+def pull_cdcvacc_data(base_url: str, logger: Logger) -> pd.DataFrame:
+    """Pull the latest data from the CDC on vaccines and conform it into a dataset.
+
+    The output dataset has:
+    - Each row corresponds to (County, Date), denoted (FIPS, timestamp)
+    - Each row additionally has columns that correspond to the counts or
+      cumulative counts of vaccination status (fully vaccinated,
+      partially vaccinated) of various age groups (all, 12+, 18+, 65+)
+      from December 13th 2020 until the latest date
+
+    Note that the raw dataset gives the `cumulative` metrics, from which
+    we compute `counts` by taking first differences.  Hence, `counts`
+    may be negative.  This is wholly dependent on the quality of the raw
+    dataset.
+
+    We filter the data such that we only keep rows with valid FIPS, or "FIPS"
+    codes defined under the exceptions of the README.  The current  exceptions
+    include:
+    # - 0: statewise unallocated
+    Parameters
+    ----------
+    base_url: str
+        Base URL for pulling the CDC Vaccination Data
+    logger: Logger
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe as described above.
+    """
+    # Columns to drop the the data frame.
+    drop_columns = [
+    "date",
+    "recip_state",
+    "series_complete_pop_pct",
+    "mmwr_week",
+    "recip_county",
+    "state_id"
+    ]
+
+
+    # Read data
+    df = pd.read_csv(base_url)
+    logger.info("data retrieved from source",
+                num_rows=df.shape[0],
+                num_cols=df.shape[1],
+                min_date=min(df['Date']),
+                max_date=max(df['Date']),
+                checksum=hashlib.sha256(pd.util.hash_pandas_object(df).values).hexdigest())
+    df.columns = [i.lower() for i in df.columns]
+
+    df['recip_state'] = df['recip_state'].str.lower()
+    drop_columns.extend([x for x in df.columns if ("pct" in x) | ("svi" in x)])
+    drop_columns =  list(set(drop_columns))
+    df = GeoMapper().add_geocode(df, "state_id", "state_code",
+        from_col="recip_state", new_col="state_id", dropna=False)
+    df['state_id'] = df['state_id'].fillna('0').astype(int)
+    # Change FIPS from 0 to XX000 for statewise unallocated cases/deaths
+    unassigned_index = (df["fips"] == "UNK")
+    df.loc[unassigned_index, "fips"] = df["state_id"].loc[unassigned_index].values * 1000
+
+    # Conform FIPS
+    df["fips"] = df["fips"].apply(lambda x: f"{int(x):05d}")
+    df["timestamp"] = pd.to_datetime(df["date"])
+    # Drop unnecessary columns (state is pre-encoded in fips)
+    try:
+        df.drop(drop_columns, axis=1, inplace=True)
+    except KeyError as e:
+        raise ValueError(
+            "Tried to drop non-existent columns. The dataset "
+            "schema may have changed.  Please investigate and "
+            "amend drop_columns."
+        ) from e
+    # timestamp: str -> datetime
+    df.columns = ["fips",
+                  "cumulative_counts_tot_vaccine",
+                  "cumulative_counts_tot_vaccine_12P",
+                  "cumulative_counts_tot_vaccine_18P",
+                  "cumulative_counts_tot_vaccine_65P",
+                  "cumulative_counts_part_vaccine",
+                  "cumulative_counts_part_vaccine_12P",
+                  "cumulative_counts_part_vaccine_18P",
+                  "cumulative_counts_part_vaccine_65P",
+                  "timestamp"]
+    df_dummy = df.loc[(df["fips"]!='00000') & (df["timestamp"] == min(df["timestamp"]))].copy()
+    #handle fips 00000 separately
+    df_oth = df.loc[((df["fips"]=='00000') &
+        (df["timestamp"]==min(df[df['fips'] == '00000']['timestamp'])))].copy()
+    df_dummy = pd.concat([df_dummy, df_oth])
+    df_dummy.loc[:, "timestamp"] = df_dummy.loc[:, "timestamp"] - pd.Timedelta(days=1)
+    df_dummy.loc[:, ["cumulative_counts_tot_vaccine",
+                    "cumulative_counts_tot_vaccine_12P",
+                    "cumulative_counts_tot_vaccine_18P",
+                    "cumulative_counts_tot_vaccine_65P",
+                    "cumulative_counts_part_vaccine",
+                    "cumulative_counts_part_vaccine_12P",
+                    "cumulative_counts_part_vaccine_18P",
+                    "cumulative_counts_part_vaccine_65P",
+                    ]] = 0
+
+    df =pd.concat([df_dummy, df])
+    # Obtain new_counts
+    df.sort_values(["fips", "timestamp"], inplace=True)
+    for to, from_d in DIFFERENCE_MAPPING.items():
+        df[to] = df[from_d].diff()
+
+    rem_list = [ x for x in list(df.columns) if x not in ['timestamp', 'fips'] ]
+    # Handle edge cases where we diffed across fips
+    mask = df["fips"] != df["fips"].shift(1)
+    df.loc[mask, rem_list] = np.nan
+    df.reset_index(inplace=True, drop=True)
+    # Final sanity checks
+    unique_days = df["timestamp"].unique()
+    min_timestamp = min(unique_days)
+    max_timestamp = max(unique_days)
+    n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, "D") + 1
+    if n_days != len(unique_days):
+        raise ValueError(
+            f"Not every day between {min_timestamp} and "
+            "{max_timestamp} is represented."
+        )
+    return df.loc[
+        df["timestamp"] >= min(df["timestamp"]),
+        # Reorder
+        ["fips", "timestamp"] + SIGNALS,
+    ].reset_index(drop=True)