cmu-delphi · minhkhul · Jun 10, 2024 · Mar 18, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/nssp/.pylintrc b/nssp/.pylintrc
@@ -0,0 +1,22 @@
+
+[MESSAGES CONTROL]
+
+disable=logging-format-interpolation,
+    too-many-locals,
+    too-many-arguments,
+    # Allow pytest functions to be part of a class.
+    no-self-use,
+    # Allow pytest classes to have one test.
+    too-few-public-methods
+
+[BASIC]
+
+# Allow arbitrarily short-named variables.
+variable-rgx=[a-z_][a-z0-9_]*
+argument-rgx=[a-z_][a-z0-9_]*
+attr-rgx=[a-z_][a-z0-9_]*
+
+[DESIGN]
+
+# Don't complain about pytest "unused" arguments.
+ignored-argument-names=(_.*|run_as_module)
diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md
@@ -0,0 +1,13 @@
+# NSSP data
+
+We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage data, from the CDC website. The data is available at the county, state, and national levels.
+
+## Geographical Levels
+* `state`: reported using two-letter postal code
+* `county`: reported using FIPS code
+* `national`: just `us` for now
+## Metrics
+*  `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen.
+*  `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid.
+*  `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: Smoothed percentage of emergency department patient visits for specified pathogen.
+*  `smoothed_percent_visits_combined`: Smoothed sum of the three percentages of visits for flu, rsv and covid.
diff --git a/nssp/Makefile b/nssp/Makefile
@@ -0,0 +1,32 @@
+# Declare command-style targets so a same-named file can never mask them.
+.PHONY: venv install install-ci lint test clean
+
+# Name of the delphi_* package directory, resolved once at parse time.
+dir := $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*' | head -1)
+
+venv:
+	python3.8 -m venv env
+
+install: venv
+	. env/bin/activate && \
+	pip install wheel && \
+	pip install -e ../_delphi_utils_python && \
+	pip install -e .
+
+install-ci: venv
+	. env/bin/activate && \
+	pip install wheel && \
+	pip install ../_delphi_utils_python && \
+	pip install .
+
+lint:
+	. env/bin/activate && pylint $(dir)
+	. env/bin/activate && pydocstyle $(dir)
+
+test:
+	. env/bin/activate && \
+	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
+
+clean:
+	rm -rf env
+	rm -f params.json
diff --git a/nssp/README.md b/nssp/README.md
@@ -0,0 +1,75 @@
+# NSSP Emergency Department Visit data
+
+We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage data, from the CDC website. The data is available at the county, state, and national levels.
+For details see the `DETAILS.md` file in this directory.
+
+## Create a MyAppToken
+`MyAppToken` is required when fetching data from SODA Consumer API 
+(https://dev.socrata.com/foundry/data.cdc.gov/r8kw-7aab). Follow the 
+steps below to create a MyAppToken.
+- Click the `Sign up for an app token` button in the linked website
+- Sign In or Sign Up with Socrata ID
+- Click the `Create New App Token` button
+- Fill in `Application Name` and `Description` (You can just use delphi_nssp
+  for both) and click `Save`
+- Copy the `App Token`
+
+
+## Running the Indicator
+
+The indicator is run by directly executing the Python module contained in this
+directory. The safest way to do this is to create a virtual environment,
+install the common DELPHI tools, and then install the module and its
+dependencies. To do this, run the following command from this directory:
+
+```
+make install
+```
+
+This command will install the package in editable mode, so you can make changes that
+will automatically propagate to the installed package. 
+
+All of the user-changeable parameters are stored in `params.json`. To execute
+the module and produce the output datasets (by default, in `receiving`), run
+the following:
+
+```
+env/bin/python -m delphi_nssp
+```
+
+If you want to enter the virtual environment in your shell, 
+you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. 
+
+Once you are finished, you can remove the virtual environment and 
+params file with the following:
+
+```
+make clean
+```
+
+## Testing the code
+
+To run static tests of the code style, run the following command:
+
+```
+make lint
+```
+
+Unit tests are also included in the module. To execute these, run the following
+command from this directory:
+
+```
+make test
+```
+
+To run individual tests, run the following:
+
+```
+(cd tests && ../env/bin/pytest <your_test>.py --cov=delphi_nssp --cov-report=term-missing)
+```
+
+The output will show the number of unit tests that passed and failed, along
+with the percentage of code covered by the tests. 
+
+None of the linting checks or unit tests should fail, and the portion of code not covered by unit tests should be small and
+should not include critical sub-routines.
diff --git a/nssp/REVIEW.md b/nssp/REVIEW.md
@@ -0,0 +1,38 @@
+## Code Review (Python)
+
+A code review of this module should include a careful look at the code and the
+output. To assist in the process, but certainly not in place of it, please
+check the following items.
+
+**Documentation**
+
+- [ ] the README.md file template is filled out and currently accurate; it is
+possible to load and test the code using only the instructions given
+- [ ] minimal docstrings (one line describing what the function does) are
+included for all functions; full docstrings describing the inputs and expected
+outputs should be given for non-trivial functions
+
+**Structure**
+
+- [ ] code should pass lint checks (`make lint`)
+- [ ] any required metadata files are checked into the repository and placed
+within the directory `static`
+- [ ] any intermediate files that are created and stored by the module should
+be placed in the directory `cache`
+- [ ] final expected output files to be uploaded to the API are placed in the
+`receiving` directory; output files should not be committed to the repository
+- [ ] all options and API keys are passed through the file `params.json`
+- [ ] template parameter file (`params.json.template`) is checked into the
+code; no personal (i.e., usernames) or private (i.e., API keys) information is
+included in this template file
+
+**Testing**
+
+- [ ] module can be installed in a new virtual environment (`make install`)
+- [ ] reasonably high level of unit test coverage covering all of the main logic
+of the code (e.g., missing coverage for raised errors that do not currently seem
+possible to reach are okay; missing coverage for options that will be needed are
+not)
+- [ ] all unit tests run without errors (`make test`)
+- [ ] indicator directory has been added to GitHub CI
+(`covidcast-indicators/.github/workflows/python-ci.yml`)
diff --git a/nssp/cache/.gitignore b/nssp/cache/.gitignore
diff --git a/nssp/delphi_nssp/__init__.py b/nssp/delphi_nssp/__init__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+"""Module to pull and clean indicators from the NSSP source.
+
+This file defines the functions that are made public by the module. As the
+module is intended to be executed through the main method, these are primarily
+for testing.
+"""
+
+from __future__ import absolute_import
+
+from . import pull
+from . import run
+
+__version__ = "0.1.0"
diff --git a/nssp/delphi_nssp/__main__.py b/nssp/delphi_nssp/__main__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""Call the function run_module when executed.
+
+This file indicates that calling the module (`python -m delphi_nssp`) will
+call the function `run_module` found within the run.py file. There should be
+no need to change this template.
+"""
+
+from delphi_utils import read_params
+from .run import run_module  # pragma: no cover
+
+run_module(read_params())  # pragma: no cover
diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py
@@ -0,0 +1,29 @@
+"""Registry for variations."""
+
+GEOS = [
+    "nation",
+    "state",
+    "county",
+]
+
+METRICS = ['percent_visits_covid','percent_visits_influenza',
+           'percent_visits_rsv','percent_visits_combined',
+           'percent_visits_smoothed_covid','percent_visits_smoothed_influenza',
+           'percent_visits_smoothed_rsv','percent_visits_smoothed_combined']
+
+SENSORS = ['percent_visits_covid','percent_visits_influenza',
+           'percent_visits_rsv','percent_visits_combined',
+           'smoothed_percent_visits_covid','smoothed_percent_visits_influenza',
+           'smoothed_percent_visits_rsv','smoothed_percent_visits_combined']
+
+NEWLINE = "\n"
+
+CSV_COLS = [
+            "geo_id",
+            "val",
+            "se",
+            "sample_size",
+            "missing_val",
+            "missing_se",
+            "missing_sample_size"
+        ]
diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+"""Functions for pulling NSSP ER data from the Socrata API."""
+
+import numpy as np
+import pandas as pd
+from sodapy import Socrata
+
+from .constants import (
+    METRICS,
+    NEWLINE,
+)
+
+
+def construct_typedicts():
+    """Create the type conversion dictionary for dataframe."""
+    # basic type conversion
+    type_dict = {key: float for key in METRICS}
+    type_dict["timestamp"] = "datetime64[ns]"
+    type_dict["geography"] = str 
+    type_dict["county"] = str
+    type_dict["fips"] = int
+    return type_dict
+
+
+def warn_string(df, type_dict):
+    """Format the warning string."""
+    return f"""
+Expected column(s) missed, The dataset schema may
+have changed. Please investigate and amend the code.
+
+Columns needed:
+{NEWLINE.join(sorted(type_dict.keys()))}
+
+Columns available:
+{NEWLINE.join(sorted(df.columns))}
+"""
+
+
+def pull_nssp_data(socrata_token: str):
+    """Pull the latest NWSS Wastewater data, and conforms it into a dataset.
+
+    The output dataset has:
+
+    - Each row corresponds to a single observation
+    - Each row additionally has columns for the signals in METRICS
+
+    Parameters
+    ----------
+    socrata_token: str
+        My App Token for pulling the NWSS data (could be the same as the nchs data)
+    test_file: Optional[str]
+        When not null, name of file from which to read test data
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe as described above.
+    """
+    type_dict = construct_typedicts()
+
+    # Pull data from Socrata API
+    client = Socrata("data.cdc.gov", socrata_token)
+    results = []
+    offset = 0
+    limit = 50000  # maximum limit allowed by SODA 2.0
+    while True:
+        page = client.get("rdmq-nq56", limit=limit, offset=offset)
+        if not page:
+            break  # exit the loop if no more results
+        results.extend(page)
+        offset += limit
-    limit = 50000  # maximum limit allowed by SODA 2.0
-    while True:
-        page = client.get("rdmq-nq56", limit=limit, offset=offset)
-        if not page:
-            break  # exit the loop if no more results
-        results.extend(page)
-        offset += limit
+    limit = 50_000
+    for ii in range(100):
+        page = client.get("rdmq-nq56", limit=limit, offset=offset)
+        if not page:
+            max_ii = ii
+            break  # exit the loop if no more results
+        results.extend(page)
+        offset += limit
+    if max_ii == 100:
+        raise ValueError("client has pulled 100x the socrata limit")
+
-    limit = 50000  # maximum limit allowed by SODA 2.0
-    while True:
-        page = client.get("rdmq-nq56", limit=limit, offset=offset)
-        if not page:
-            break  # exit the loop if no more results
-        results.extend(page)
-        offset += limit
+    limit = 50_000
+    for ii in range(100):
+        page = client.get("rdmq-nq56", limit=limit, offset=offset)
+        if not page:
+            max_ii = ii
+            break  # exit the loop if no more results
+        results.extend(page)
+        offset += limit
+    if max_ii == 100:
+        raise ValueError("client has pulled 100x the socrata limit")
+
+    df_ervisits = pd.DataFrame.from_records(results)
+    df_ervisits = df_ervisits.rename(columns={"week_end": "timestamp", 
+                                              "percent_visits_smoothed":"percent_visits_smoothed_combined",
+                                              "percent_visits_smoothed_1":"percent_visits_smoothed_influenza",})
+
+    try:
+        df_ervisits = df_ervisits.astype(type_dict)
+    except KeyError as exc:
+        raise ValueError(warn_string(df_ervisits, type_dict)) from exc
+
+    keep_columns = ["timestamp", "geography", "county", "fips"]
+    return df_ervisits[METRICS + keep_columns]