From 7e2229a625273a7198a642de1d62664badd7b552 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Wed, 6 Nov 2024 17:18:39 -0500 Subject: [PATCH 01/18] base changes --- nssp/delphi_nssp/constants.py | 26 ++++++++++++++ nssp/delphi_nssp/pull.py | 68 +++++++++++++++++++++++++++++------ nssp/delphi_nssp/run.py | 45 +++++++++++++++++++++-- 3 files changed, 127 insertions(+), 12 deletions(-) diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index 9b98d2012..b22dd1443 100644 --- a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -41,3 +41,29 @@ "fips": str, } ) + +SECONDARY_COLS_MAP = { + "week_end": "timestamp", + "geography": "geo_value", + "percent_visits": "val", + "pathogen": "signal", +} + +SECONDARY_SIGNALS_MAP = { + "COVID-19": "pct_ed_visits_covid_secondary", + "INFLUENZA": "pct_ed_visits_influenza_secondary", + "RSV": "pct_ed_visits_rsv_secondary", + "Combined": "pct_ed_visits_combined_secondary", +} + +SECONDARY_SIGNALS = [val for (key, val) in SECONDARY_SIGNALS_MAP.items()] +SECONDARY_GEOS = ["state","nation","hhs"] + +SECONDARY_TYPE_DICT = { + "timestamp": "datetime64[ns]", + "geo_value": str, + "val": float, + "geo_type": str, + "signal": str, +} +SECONDARY_KEEP_COLS = [key for (key, val) in SECONDARY_TYPE_DICT.items()] \ No newline at end of file diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index ece94fab4..5e6bd2b58 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -6,7 +6,7 @@ import pandas as pd from sodapy import Socrata -from .constants import NEWLINE, SIGNALS, SIGNALS_MAP, TYPE_DICT +from .constants import * def warn_string(df, type_dict): @@ -28,19 +28,13 @@ def warn_string(df, type_dict): def pull_nssp_data(socrata_token: str): - """Pull the latest NSSP ER visits data, and conforms it into a dataset. 
- - The output dataset has: - - - Each row corresponds to a single observation - - Each row additionally has columns for the signals in SIGNALS + """Pull the latest NSSP ER visits primary dataset + https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview Parameters ---------- socrata_token: str - My App Token for pulling the NWSS data (could be the same as the nchs data) - test_file: Optional[str] - When not null, name of file from which to read test data + My App Token for pulling the NSSP data (could be the same as the nchs data) Returns ------- @@ -72,3 +66,57 @@ def pull_nssp_data(socrata_token: str): keep_columns = ["timestamp", "geography", "county", "fips"] return df_ervisits[SIGNALS + keep_columns] + + +def secondary_pull_nssp_data(socrata_token: str): + """Pull the latest NSSP ER visits secondary dataset: + https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview + + The output dataset has: + + - Each row corresponds to a single observation + + Parameters + ---------- + socrata_token: str + My App Token for pulling the NSSP data (could be the same as the nchs data) + + Returns + ------- + pd.DataFrame + Dataframe as described above. 
+ """ + # Pull data from Socrata API + client = Socrata("data.cdc.gov", socrata_token) + results = [] + offset = 0 + limit = 50000 # maximum limit allowed by SODA 2.0 + while True: + page = client.get("7mra-9cq9", limit=limit, offset=offset) + if not page: + break # exit the loop if no more results + results.extend(page) + offset += limit + df_ervisits = pd.DataFrame.from_records(results) + df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP) + + # geo_type is not provided in the dataset, so we infer it from the geo_value + # which is either state names, "National" or hhs region numbers + df_ervisits['geo_type'] = 'state' + + df_ervisits.loc[df_ervisits['geo_value'] == 'National', 'geo_type'] = 'nation' + + hhs_region_mask = df_ervisits['geo_value'].str.startswith('Region ') + df_ervisits.loc[hhs_region_mask, 'geo_value'] = df_ervisits.loc[hhs_region_mask, 'geo_value'].str.replace('Region ', '') + df_ervisits.loc[hhs_region_mask, 'geo_type'] = 'hhs' + + df_ervisits['signal'] = df_ervisits['signal'].map(SECONDARY_SIGNALS_MAP) + + df_ervisits = df_ervisits[SECONDARY_KEEP_COLS] + + try: + df_ervisits = df_ervisits.astype(SECONDARY_TYPE_DICT) + except KeyError as exc: + raise ValueError(warn_string(df_ervisits, SECONDARY_TYPE_DICT)) from exc + + return df_ervisits diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index b22d03c20..1c950946f 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -31,8 +31,8 @@ from delphi_utils.geomap import GeoMapper from delphi_utils.nancodes import add_default_nancodes -from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SIGNALS -from .pull import pull_nssp_data +from .constants import * +from .pull import pull_nssp_data, secondary_pull_nssp_data def add_needed_columns(df, col_names=None): @@ -81,6 +81,7 @@ def run_module(params): socrata_token = params["indicator"]["socrata_token"] run_stats = [] + ## build the base version of the signal at the most detailed geo level you can get. 
## compute stuff here or farm out to another function or file df_pull = pull_nssp_data(socrata_token) @@ -137,5 +138,45 @@ def run_module(params): if len(dates) > 0: run_stats.append((max(dates), len(dates))) + secondary_df_pull = secondary_pull_nssp_data(socrata_token) + ## aggregate + geo_mapper = GeoMapper() + for signal in SECONDARY_SIGNALS: + for geo in SECONDARY_GEOS: + df = secondary_df_pull.copy() + logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal) + if geo == "state": + df = df[(df["geo_type"] == "state")] + df["geo_id"] = df["geo_value"].apply( + lambda x: ( + us.states.lookup(x).abbr.lower() if us.states.lookup(x) + else ("dc" if x == "District of Columbia" else x) + ) + ) + unexpected_state_names = df[df["geo_id"] == df["geo_value"]] + if unexpected_state_names.shape[0] > 0: + logger.error("Unexpected state names", df=unexpected_state_names) + exit(1) + elif geo == "nation": + df = df[(df["geo_type"] == "nation")] + df["geo_id"] = "us" + elif geo == "hhs": + df = df[(df["geo_type"] == "hhs")] + df["geo_id"] = df["geo_type"] + # add se, sample_size, and na codes + missing_cols = set(CSV_COLS) - set(df.columns) + df = add_needed_columns(df, col_names=list(missing_cols)) + df_csv = df[CSV_COLS + ["timestamp"]] + # actual export + dates = create_export_csv( + df_csv, + geo_res=geo, + export_dir=export_dir, + sensor=signal, + weekly_dates=True, + ) + if len(dates) > 0: + run_stats.append((max(dates), len(dates))) + ## log this indicator run logging(start_time, run_stats, logger) From 6325e09cded5603cec8192efd550300fb03593dd Mon Sep 17 00:00:00 2001 From: minhkhul Date: Wed, 6 Nov 2024 17:42:25 -0500 Subject: [PATCH 02/18] lint --- nssp/delphi_nssp/constants.py | 4 ++-- nssp/delphi_nssp/pull.py | 25 ++++++++++++++++++------- nssp/delphi_nssp/run.py | 8 +++++--- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index b22dd1443..275b44c87 100644 --- 
a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -57,7 +57,7 @@ } SECONDARY_SIGNALS = [val for (key, val) in SECONDARY_SIGNALS_MAP.items()] -SECONDARY_GEOS = ["state","nation","hhs"] +SECONDARY_GEOS = ["state", "nation", "hhs"] SECONDARY_TYPE_DICT = { "timestamp": "datetime64[ns]", @@ -66,4 +66,4 @@ "geo_type": str, "signal": str, } -SECONDARY_KEEP_COLS = [key for (key, val) in SECONDARY_TYPE_DICT.items()] \ No newline at end of file +SECONDARY_KEEP_COLS = [key for (key, val) in SECONDARY_TYPE_DICT.items()] diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 5e6bd2b58..bd809bc36 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -6,7 +6,16 @@ import pandas as pd from sodapy import Socrata -from .constants import * +from .constants import ( + NEWLINE, + SECONDARY_COLS_MAP, + SECONDARY_KEEP_COLS, + SECONDARY_SIGNALS_MAP, + SECONDARY_TYPE_DICT, + SIGNALS, + SIGNALS_MAP, + TYPE_DICT, +) def warn_string(df, type_dict): @@ -102,15 +111,17 @@ def secondary_pull_nssp_data(socrata_token: str): # geo_type is not provided in the dataset, so we infer it from the geo_value # which is either state names, "National" or hhs region numbers - df_ervisits['geo_type'] = 'state' + df_ervisits["geo_type"] = "state" - df_ervisits.loc[df_ervisits['geo_value'] == 'National', 'geo_type'] = 'nation' + df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation" - hhs_region_mask = df_ervisits['geo_value'].str.startswith('Region ') - df_ervisits.loc[hhs_region_mask, 'geo_value'] = df_ervisits.loc[hhs_region_mask, 'geo_value'].str.replace('Region ', '') - df_ervisits.loc[hhs_region_mask, 'geo_type'] = 'hhs' + hhs_region_mask = df_ervisits["geo_value"].str.startswith("Region ") + df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[hhs_region_mask, "geo_value"].str.replace( + "Region ", "" + ) + df_ervisits.loc[hhs_region_mask, "geo_type"] = "hhs" - df_ervisits['signal'] = 
df_ervisits['signal'].map(SECONDARY_SIGNALS_MAP) + df_ervisits["signal"] = df_ervisits["signal"].map(SECONDARY_SIGNALS_MAP) df_ervisits = df_ervisits[SECONDARY_KEEP_COLS] diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 1c950946f..9c1587960 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -22,6 +22,7 @@ - "cache_dir": str, directory of locally cached data """ +import sys import time from datetime import datetime @@ -31,7 +32,7 @@ from delphi_utils.geomap import GeoMapper from delphi_utils.nancodes import add_default_nancodes -from .constants import * +from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SECONDARY_GEOS, SECONDARY_SIGNALS, SIGNALS from .pull import pull_nssp_data, secondary_pull_nssp_data @@ -149,14 +150,15 @@ def run_module(params): df = df[(df["geo_type"] == "state")] df["geo_id"] = df["geo_value"].apply( lambda x: ( - us.states.lookup(x).abbr.lower() if us.states.lookup(x) + us.states.lookup(x).abbr.lower() + if us.states.lookup(x) else ("dc" if x == "District of Columbia" else x) ) ) unexpected_state_names = df[df["geo_id"] == df["geo_value"]] if unexpected_state_names.shape[0] > 0: logger.error("Unexpected state names", df=unexpected_state_names) - exit(1) + sys.exit(1) elif geo == "nation": df = df[(df["geo_type"] == "nation")] df["geo_id"] = "us" From 99779c33fd7a7cbd79754d85f8d00aaf30fb4795 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Wed, 6 Nov 2024 17:49:24 -0500 Subject: [PATCH 03/18] lint --- nssp/delphi_nssp/pull.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index bd809bc36..1448b275e 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -37,7 +37,8 @@ def warn_string(df, type_dict): def pull_nssp_data(socrata_token: str): - """Pull the latest NSSP ER visits primary dataset + """Pull the latest NSSP ER visits primary dataset. 
+ https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview Parameters @@ -78,7 +79,8 @@ def pull_nssp_data(socrata_token: str): def secondary_pull_nssp_data(socrata_token: str): - """Pull the latest NSSP ER visits secondary dataset: + """Pull the latest NSSP ER visits secondary dataset. + https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview The output dataset has: From 767b97e19a442c7e3185ec583bd6748ccbf83e83 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Wed, 6 Nov 2024 19:04:54 -0500 Subject: [PATCH 04/18] add test --- nssp/delphi_nssp/pull.py | 49 +-- nssp/tests/test_data/secondary_page.txt | 443 ++++++++++++++++++++++++ nssp/tests/test_pull.py | 36 +- 3 files changed, 506 insertions(+), 22 deletions(-) create mode 100644 nssp/tests/test_data/secondary_page.txt diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 1448b275e..175577ee3 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -36,33 +36,50 @@ def warn_string(df, type_dict): return warn -def pull_nssp_data(socrata_token: str): - """Pull the latest NSSP ER visits primary dataset. - - https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview +def pull_with_socrata_api(socrata_token: str, dataset_id: str): + """Pull data from Socrata API. Parameters ---------- socrata_token: str My App Token for pulling the NSSP data (could be the same as the nchs data) + dataset_id: str + The dataset id to pull data from Returns ------- - pd.DataFrame - Dataframe as described above. 
+ list of dictionaries, each representing a row in the dataset """ - # Pull data from Socrata API client = Socrata("data.cdc.gov", socrata_token) results = [] offset = 0 limit = 50000 # maximum limit allowed by SODA 2.0 while True: - page = client.get("rdmq-nq56", limit=limit, offset=offset) + page = client.get(dataset_id, limit=limit, offset=offset) if not page: break # exit the loop if no more results results.extend(page) offset += limit - df_ervisits = pd.DataFrame.from_records(results) + return results + + +def pull_nssp_data(socrata_token: str): + """Pull the latest NSSP ER visits primary dataset. + + https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview + + Parameters + ---------- + socrata_token: str + My App Token for pulling the NSSP data (could be the same as the nchs data) + + Returns + ------- + pd.DataFrame + Dataframe as described above. + """ + socrata_results = pull_with_socrata_api(socrata_token, "rdmq-nq56") + df_ervisits = pd.DataFrame.from_records(socrata_results) df_ervisits = df_ervisits.rename(columns={"week_end": "timestamp"}) df_ervisits = df_ervisits.rename(columns=SIGNALS_MAP) @@ -97,18 +114,8 @@ def secondary_pull_nssp_data(socrata_token: str): pd.DataFrame Dataframe as described above. 
""" - # Pull data from Socrata API - client = Socrata("data.cdc.gov", socrata_token) - results = [] - offset = 0 - limit = 50000 # maximum limit allowed by SODA 2.0 - while True: - page = client.get("7mra-9cq9", limit=limit, offset=offset) - if not page: - break # exit the loop if no more results - results.extend(page) - offset += limit - df_ervisits = pd.DataFrame.from_records(results) + socrata_results = pull_with_socrata_api(socrata_token, "7mra-9cq9") + df_ervisits = pd.DataFrame.from_records(socrata_results) df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP) # geo_type is not provided in the dataset, so we infer it from the geo_value diff --git a/nssp/tests/test_data/secondary_page.txt b/nssp/tests/test_data/secondary_page.txt new file mode 100644 index 000000000..106473732 --- /dev/null +++ b/nssp/tests/test_data/secondary_page.txt @@ -0,0 +1,443 @@ +[ + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Michigan", + "percent_visits": "1.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Minnesota", + "percent_visits": "1.9", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Mississippi", + "percent_visits": "3.6", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Montana", + "percent_visits": "1.8", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "National", + "percent_visits": "2.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": 
"2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Nebraska", + "percent_visits": "1.4", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Nevada", + "percent_visits": "1.5", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "New Jersey", + "percent_visits": "1.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "New Mexico", + "percent_visits": "2.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "New York", + "percent_visits": "1.6", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "North Carolina", + "percent_visits": "3.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "North Dakota", + "percent_visits": "1.0", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Ohio", + "percent_visits": "1.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Oklahoma", + "percent_visits": "2.0", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": 
"2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Oregon", + "percent_visits": "2.6", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Pennsylvania", + "percent_visits": "1.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 1", + "percent_visits": "1.9", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 10", + "percent_visits": "2.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 2", + "percent_visits": "1.7", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 3", + "percent_visits": "2.1", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 4", + "percent_visits": "3.3", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 5", + "percent_visits": "1.9", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 6", + "percent_visits": "3.7", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": 
"2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 7", + "percent_visits": "1.0", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 8", + "percent_visits": "1.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Region 9", + "percent_visits": "2.4", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Rhode Island", + "percent_visits": "1.4", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "South Carolina", + "percent_visits": "3.4", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Tennessee", + "percent_visits": "2.3", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Texas", + "percent_visits": "3.9", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Utah", + "percent_visits": "1.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Vermont", + "percent_visits": "1.7", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": 
"2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Virginia", + "percent_visits": "2.8", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Washington", + "percent_visits": "2.5", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "West Virginia", + "percent_visits": "2.0", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "COVID-19", + "geography": "Wisconsin", + "percent_visits": "2.0", + "status": "Reporting", + "trend_on_date": "Increasing", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Alabama", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Increasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Alaska", + "percent_visits": "0.2", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Arizona", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Increasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Arkansas", + "percent_visits": "0.2", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "California", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "Decreasing", + "recent_trend": "No Change" + }, + { + "week_end": 
"2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Colorado", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Increasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Connecticut", + "percent_visits": "0.0", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Delaware", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "District of Columbia", + "percent_visits": "0.0", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Florida", + "percent_visits": "0.4", + "status": "Reporting", + "trend_on_date": "Decreasing", + "recent_trend": "Increasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Georgia", + "percent_visits": "0.1", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Hawaii", + "percent_visits": "1.0", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "Decreasing" + }, + { + "week_end": "2024-08-03T00:00:00.000", + "pathogen": "Influenza", + "geography": "Idaho", + "percent_visits": "0.0", + "status": "Reporting", + "trend_on_date": "No Change", + "recent_trend": "No Change" + } +] \ No newline at end of file diff --git a/nssp/tests/test_pull.py b/nssp/tests/test_pull.py index b356341f6..d9296e277 100644 --- a/nssp/tests/test_pull.py +++ b/nssp/tests/test_pull.py @@ -12,10 +12,16 @@ from delphi_nssp.pull import ( pull_nssp_data, + 
secondary_pull_nssp_data, + pull_with_socrata_api, ) from delphi_nssp.constants import ( - SIGNALS, NEWLINE, + SECONDARY_COLS_MAP, + SECONDARY_KEEP_COLS, + SECONDARY_SIGNALS_MAP, + SECONDARY_TYPE_DICT, + SIGNALS, SIGNALS_MAP, TYPE_DICT, ) @@ -55,6 +61,34 @@ def test_pull_nssp_data(self, mock_socrata): for signal in SIGNALS: assert result[signal].notnull().all(), f"{signal} has rogue NaN" + @patch("delphi_nssp.pull.Socrata") + def test_secondary_pull_nssp_data(self, mock_socrata): + # Load test data + with open("test_data/secondary_page.txt", "r") as f: + test_data = json.load(f) + + # Mock Socrata client and its get method + mock_client = MagicMock() + mock_client.get.side_effect = [test_data, []] # Return test data on first call, empty list on second call + mock_socrata.return_value = mock_client + + # Call function with test token + test_token = "test_token" + result = secondary_pull_nssp_data(test_token) + # print(result) + + # Check that Socrata client was initialized with correct arguments + mock_socrata.assert_called_once_with("data.cdc.gov", test_token) + + # Check that get method was called with correct arguments + mock_client.get.assert_any_call("7mra-9cq9", limit=50000, offset=0) + + for col in SECONDARY_KEEP_COLS: + assert result[col].notnull().all(), f"{col} has rogue NaN" + + assert result[result['geo_value'].str.startswith('Region') ].empty, "'Region ' need to be removed from geo_value for geo_type 'hhs'" + assert (result[result['geo_type'] == 'nation']['geo_value'] == 'National').all(), "All rows with geo_type 'nation' must have geo_value 'National'" + if __name__ == "__main__": unittest.main() From bb6c566259e640237f7b5ea1eac2b75e6c769248 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 7 Nov 2024 14:01:37 -0500 Subject: [PATCH 05/18] fix hhs bug + doc to readme + fix signal grouping --- nssp/DETAILS.md | 9 ++++++++- nssp/README.md | 5 +++++ nssp/delphi_nssp/run.py | 7 +++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git 
a/nssp/DETAILS.md b/nssp/DETAILS.md index 539697baa..c20b9b22b 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -2,6 +2,11 @@ We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage of ER visits attributable to a given pathogen, from the CDC website. The data is provided at the county level, state level and national level; we do a population-weighted mean to aggregate from county data up to the HRR and MSA levels. +There are 2 sources we grab data from for nssp: +- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview +- Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview +There are 8 signals output from the primary source and 4 output from secondary. Secondary source data is only available from state-level geos and above, though secondary data might be updated more often. + ## Geographical Levels * `state`: reported using two-letter postal code * `county`: reported using fips code @@ -10,4 +15,6 @@ We import the NSSP Emergency Department Visit data, including percentage and smo * `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen. * `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. * `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: 3 week moving average of the percentage of emergency department patient visits for specified pathogen. -* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file +* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. 
+* `percent_visits_covid_secondary`, `percent_visits_rsv_secondary`, `percent_visits_influenza_secondary`: Taken from secondary source, percentage of emergency department patient visits for specified pathogen. +* `percent_visits_combined_secondary`: Taken from secondary source, sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file diff --git a/nssp/README.md b/nssp/README.md index 4bba6f626..c3f57b94b 100644 --- a/nssp/README.md +++ b/nssp/README.md @@ -1,6 +1,11 @@ # NSSP Emergency Department Visit data We import the NSSP Emergency Department Visit data, currently only the smoothed concentration, from the CDC website, aggregate to the state and national level from the wastewater sample site level, and export the aggregated data. + +There are 2 sources we grab data from for nssp: +- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview +- Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview + For details see the `DETAILS.md` file in this directory. 
## Create a MyAppToken diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 9c1587960..3a2a253d2 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -143,8 +143,11 @@ def run_module(params): ## aggregate geo_mapper = GeoMapper() for signal in SECONDARY_SIGNALS: + secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] + if secondary_df_pull_signal.empty: + continue for geo in SECONDARY_GEOS: - df = secondary_df_pull.copy() + df = secondary_df_pull_signal.copy() logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal) if geo == "state": df = df[(df["geo_type"] == "state")] @@ -164,7 +167,7 @@ def run_module(params): df["geo_id"] = "us" elif geo == "hhs": df = df[(df["geo_type"] == "hhs")] - df["geo_id"] = df["geo_type"] + df["geo_id"] = df["geo_value"] # add se, sample_size, and na codes missing_cols = set(CSV_COLS) - set(df.columns) df = add_needed_columns(df, col_names=list(missing_cols)) From 188c2f49b69ba4402a191dc6741e792a90488174 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 7 Nov 2024 14:25:55 -0500 Subject: [PATCH 06/18] fix weird nan --- nssp/delphi_nssp/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index 275b44c87..85cefe1d9 100644 --- a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -51,7 +51,7 @@ SECONDARY_SIGNALS_MAP = { "COVID-19": "pct_ed_visits_covid_secondary", - "INFLUENZA": "pct_ed_visits_influenza_secondary", + "Influenza": "pct_ed_visits_influenza_secondary", "RSV": "pct_ed_visits_rsv_secondary", "Combined": "pct_ed_visits_combined_secondary", } From 21d9732343ad6c57057b9c68f4d15791c05901e4 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 7 Nov 2024 14:53:53 -0500 Subject: [PATCH 07/18] logging + error details --- nssp/delphi_nssp/run.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nssp/delphi_nssp/run.py 
b/nssp/delphi_nssp/run.py index 3a2a253d2..83b8c0c7c 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -83,6 +83,7 @@ def run_module(params): run_stats = [] + logger.info("Generating primary signals") ## build the base version of the signal at the most detailed geo level you can get. ## compute stuff here or farm out to another function or file df_pull = pull_nssp_data(socrata_token) @@ -139,9 +140,8 @@ def run_module(params): if len(dates) > 0: run_stats.append((max(dates), len(dates))) + logger.info("Generating secondary signals") secondary_df_pull = secondary_pull_nssp_data(socrata_token) - ## aggregate - geo_mapper = GeoMapper() for signal in SECONDARY_SIGNALS: secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] if secondary_df_pull_signal.empty: @@ -160,8 +160,7 @@ def run_module(params): ) unexpected_state_names = df[df["geo_id"] == df["geo_value"]] if unexpected_state_names.shape[0] > 0: - logger.error("Unexpected state names", df=unexpected_state_names) - sys.exit(1) + raise RuntimeError(f"Unexpected state names: {unexpected_state_names}") elif geo == "nation": df = df[(df["geo_type"] == "nation")] df["geo_id"] = "us" From 852f6d9e4d7117b7b8ec843f50ed4b1eefc7371f Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 7 Nov 2024 17:10:40 -0500 Subject: [PATCH 08/18] test data sync --- nssp/delphi_nssp/run.py | 7 +- nssp/tests/test_data/secondary_page.txt | 436 ++---------------------- 2 files changed, 38 insertions(+), 405 deletions(-) diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 83b8c0c7c..2d389c305 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -22,7 +22,6 @@ - "cache_dir": str, directory of locally cached data """ -import sys import time from datetime import datetime @@ -160,7 +159,11 @@ def run_module(params): ) unexpected_state_names = df[df["geo_id"] == df["geo_value"]] if unexpected_state_names.shape[0] > 0: - raise RuntimeError(f"Unexpected state names: 
{unexpected_state_names}") + logger.error( + "Unexpected state names", + unexpected_state_names=unexpected_state_names["geo_value"].unique(), + ) + raise RuntimeError elif geo == "nation": df = df[(df["geo_type"] == "nation")] df["geo_id"] = "us" diff --git a/nssp/tests/test_data/secondary_page.txt b/nssp/tests/test_data/secondary_page.txt index 106473732..4b4aaaca7 100644 --- a/nssp/tests/test_data/secondary_page.txt +++ b/nssp/tests/test_data/secondary_page.txt @@ -1,443 +1,73 @@ [ { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Michigan", - "percent_visits": "1.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Minnesota", - "percent_visits": "1.9", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Mississippi", - "percent_visits": "3.6", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Montana", + "week_end": "2022-10-01T00:00:00.000", + "pathogen": "COVID-19", "geography": "National", "percent_visits": "1.8", "status": "Reporting", - "trend_on_date": "No Change", + "trend_on_date": "Decreasing", "recent_trend": "Decreasing" }, { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", + "week_end": "2022-10-01T00:00:00.000", + "pathogen": "Influenza", "geography": "National", - "percent_visits": "2.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Nebraska", - "percent_visits": "1.4", + "percent_visits": "0.5", "status": "Reporting", "trend_on_date": "Increasing", - "recent_trend": 
"Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Nevada", - "percent_visits": "1.5", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "Decreasing" + "recent_trend": "Increasing" }, { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "New Jersey", - "percent_visits": "1.8", + "week_end": "2022-10-01T00:00:00.000", + "pathogen": "RSV", + "geography": "National", + "percent_visits": "0.5", "status": "Reporting", "trend_on_date": "Increasing", - "recent_trend": "Decreasing" + "recent_trend": "Increasing" }, { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "New Mexico", + "week_end": "2022-10-01T00:00:00.000", + "pathogen": "Combined", + "geography": "National", "percent_visits": "2.8", "status": "Reporting", - "trend_on_date": "Increasing", + "trend_on_date": "Decreasing", "recent_trend": "Decreasing" }, { - "week_end": "2024-08-03T00:00:00.000", + "week_end": "2022-10-15T00:00:00.000", "pathogen": "COVID-19", - "geography": "New York", + "geography": "National", "percent_visits": "1.6", "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "North Carolina", - "percent_visits": "3.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "North Dakota", - "percent_visits": "1.0", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Ohio", - "percent_visits": "1.8", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": 
"Oklahoma", - "percent_visits": "2.0", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Oregon", - "percent_visits": "2.6", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Pennsylvania", - "percent_visits": "1.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 1", - "percent_visits": "1.9", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 10", - "percent_visits": "2.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 2", - "percent_visits": "1.7", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 3", - "percent_visits": "2.1", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 4", - "percent_visits": "3.3", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 5", - "percent_visits": "1.9", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 6", - 
"percent_visits": "3.7", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 7", - "percent_visits": "1.0", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 8", - "percent_visits": "1.8", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Region 9", - "percent_visits": "2.4", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Rhode Island", - "percent_visits": "1.4", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "South Carolina", - "percent_visits": "3.4", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Tennessee", - "percent_visits": "2.3", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Texas", - "percent_visits": "3.9", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Utah", - "percent_visits": "1.8", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Vermont", - "percent_visits": "1.7", 
- "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Virginia", - "percent_visits": "2.8", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Washington", - "percent_visits": "2.5", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "West Virginia", - "percent_visits": "2.0", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "COVID-19", - "geography": "Wisconsin", - "percent_visits": "2.0", - "status": "Reporting", - "trend_on_date": "Increasing", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Alabama", - "percent_visits": "0.1", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "Increasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Alaska", - "percent_visits": "0.2", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Arizona", - "percent_visits": "0.1", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "Increasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Arkansas", - "percent_visits": "0.2", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "California", - "percent_visits": "0.1", - "status": 
"Reporting", "trend_on_date": "Decreasing", - "recent_trend": "No Change" + "recent_trend": "Decreasing" }, { - "week_end": "2024-08-03T00:00:00.000", + "week_end": "2022-10-15T00:00:00.000", "pathogen": "Influenza", - "geography": "Colorado", - "percent_visits": "0.1", + "geography": "National", + "percent_visits": "0.9", "status": "Reporting", - "trend_on_date": "No Change", + "trend_on_date": "Increasing", "recent_trend": "Increasing" }, { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Connecticut", - "percent_visits": "0.0", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Delaware", - "percent_visits": "0.1", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "District of Columbia", - "percent_visits": "0.0", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Florida", - "percent_visits": "0.4", + "week_end": "2022-10-15T00:00:00.000", + "pathogen": "RSV", + "geography": "National", + "percent_visits": "0.7", "status": "Reporting", - "trend_on_date": "Decreasing", + "trend_on_date": "Increasing", "recent_trend": "Increasing" }, { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Georgia", - "percent_visits": "0.1", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Hawaii", - "percent_visits": "1.0", + "week_end": "2022-10-15T00:00:00.000", + "pathogen": "Combined", + "geography": "National", + "percent_visits": "3.2", "status": "Reporting", - "trend_on_date": "No Change", + 
"trend_on_date": "Increasing", "recent_trend": "Decreasing" - }, - { - "week_end": "2024-08-03T00:00:00.000", - "pathogen": "Influenza", - "geography": "Idaho", - "percent_visits": "0.0", - "status": "Reporting", - "trend_on_date": "No Change", - "recent_trend": "No Change" } ] \ No newline at end of file From a8e5c89c06ac3193200cc0e6f11b98f7def9da9d Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Fri, 8 Nov 2024 11:45:28 -0500 Subject: [PATCH 09/18] typo in pull.py --- nssp/delphi_nssp/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 792157b15..d79ddd131 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -65,7 +65,7 @@ def pull_with_socrata_api(socrata_token: str, dataset_id: str): return results -def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None):: +def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None): """Pull the latest NSSP ER visits primary dataset. 
https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview From bbe135bb6425e6163b66b55a46f003287448c263 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 10:17:51 -0500 Subject: [PATCH 10/18] region to lower --- nssp/delphi_nssp/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index d79ddd131..3033b1eff 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -127,7 +127,7 @@ def secondary_pull_nssp_data(socrata_token: str): df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation" - hhs_region_mask = df_ervisits["geo_value"].str.startswith("Region ") + hhs_region_mask = df_ervisits["geo_value"].str.lower().startswith("region ") df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[hhs_region_mask, "geo_value"].str.replace( "Region ", "" ) From b9df78f002f8dc3e2ffe1d57a21708eb98512c36 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 10:19:08 -0500 Subject: [PATCH 11/18] add log --- nssp/delphi_nssp/run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index feddd0b90..189afe60b 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -146,6 +146,7 @@ def run_module(params): for signal in SECONDARY_SIGNALS: secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] if secondary_df_pull_signal.empty: + logger.warning("No data found for signal", signal=signal) continue for geo in SECONDARY_GEOS: df = secondary_df_pull_signal.copy() From ae5d623a2fbc74c361051376c2932028891b3eaf Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 15:59:56 -0500 Subject: [PATCH 12/18] fix str bug --- nssp/delphi_nssp/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 3033b1eff..c6973ac6f 100644 --- 
a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -127,7 +127,7 @@ def secondary_pull_nssp_data(socrata_token: str): df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation" - hhs_region_mask = df_ervisits["geo_value"].str.lower().startswith("region ") + hhs_region_mask = df_ervisits["geo_value"].str.lower().str.startswith("region ") df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[hhs_region_mask, "geo_value"].str.replace( "Region ", "" ) From 9e95adb544d7fca5cf58ccbbbcb1095b6d755f29 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 16:36:44 -0500 Subject: [PATCH 13/18] add backup data mechanism --- nssp/delphi_nssp/pull.py | 3 ++- nssp/delphi_nssp/run.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index c6973ac6f..b5c93e200 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -98,7 +98,7 @@ def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger return df_ervisits[SIGNALS + keep_columns] -def secondary_pull_nssp_data(socrata_token: str): +def secondary_pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None): """Pull the latest NSSP ER visits secondary dataset. 
https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview @@ -119,6 +119,7 @@ def secondary_pull_nssp_data(socrata_token: str): """ socrata_results = pull_with_socrata_api(socrata_token, "7mra-9cq9") df_ervisits = pd.DataFrame.from_records(socrata_results) + create_backup_csv(df_ervisits, backup_dir, custom_run, sensor="secondary", logger=logger) df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP) # geo_type is not provided in the dataset, so we infer it from the geo_value diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 189afe60b..417c49ab2 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -142,7 +142,7 @@ def run_module(params): run_stats.append((max(dates), len(dates))) logger.info("Generating secondary signals") - secondary_df_pull = secondary_pull_nssp_data(socrata_token) + secondary_df_pull = secondary_pull_nssp_data(socrata_token, backup_dir, custom_run, logger) for signal in SECONDARY_SIGNALS: secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] if secondary_df_pull_signal.empty: From 9feccefd1d72ea8b4b706b891859858da4e7c9c2 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 16:45:25 -0500 Subject: [PATCH 14/18] adjust details.md --- nssp/DETAILS.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index c20b9b22b..860ccd695 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -5,12 +5,22 @@ We import the NSSP Emergency Department Visit data, including percentage and smo There are 2 sources we grab data from for nssp: - Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview - Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview -There are 8 signals output from the 
primary source and 4 output from secondary. Secondary source data is only available from state-level geos and above, though secondary data might be updated more often. +There are 8 signals output from the primary source and 4 output from secondary. There are no smoothed signals from secondary source. + +Note that the data produced from secondary source are mostly the same as their primary source equivalent, with past analysis shows around 95% of datapoints having less than 0.1 value difference and the other 5% having a 0.1 to 1.2 value difference. ## Geographical Levels -* `state`: reported using two-letter postal code -* `county`: reported using fips code -* `national`: just `us` for now +Primary source: +* `state`: reported from source using two-letter postal code +* `county`: reported from source using fips code +* `national`: just `us` for now, reported from source +* `hhs`, `hrr`, `msa`: not reported from source, so we computed them from county-level data using a weighted mean. Each county is assigned a weight equal to its population in the last census (2020). + +Secondary source: +* `state`: reported from source +* `hhs`: reported from source +* `national`: reported from source + ## Metrics * `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen. * `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. 
From e89a2998fd22aa4dade0e98efb64cdc185c6620b Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 17:26:13 -0500 Subject: [PATCH 15/18] appease linter --- nssp/delphi_nssp/pull.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index b5c93e200..94058dea8 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -98,7 +98,9 @@ def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger return df_ervisits[SIGNALS + keep_columns] -def secondary_pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None): +def secondary_pull_nssp_data( + socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None +): """Pull the latest NSSP ER visits secondary dataset. https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview From f3cb5090106c80423c8900cbb4672e26004dfa5a Mon Sep 17 00:00:00 2001 From: minhkhul Date: Mon, 11 Nov 2024 17:36:22 -0500 Subject: [PATCH 16/18] add tests for secondary source backup --- nssp/tests/test_pull.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nssp/tests/test_pull.py b/nssp/tests/test_pull.py index b07d47466..30debd6cd 100644 --- a/nssp/tests/test_pull.py +++ b/nssp/tests/test_pull.py @@ -83,6 +83,9 @@ def test_pull_nssp_data(self, mock_socrata, caplog): @patch("delphi_nssp.pull.Socrata") def test_secondary_pull_nssp_data(self, mock_socrata): + today = pd.Timestamp.today().strftime("%Y%m%d") + backup_dir = 'test_raw_data_backups' + # Load test data with open("test_data/secondary_page.txt", "r") as f: test_data = json.load(f) @@ -92,9 +95,11 @@ def test_secondary_pull_nssp_data(self, mock_socrata): mock_client.get.side_effect = [test_data, []] # Return test data on first call, empty list on second call mock_socrata.return_value = mock_client + custom_run 
= False + logger = get_structured_logger() # Call function with test token test_token = "test_token" - result = secondary_pull_nssp_data(test_token) + result = secondary_pull_nssp_data(test_token, backup_dir, custom_run, logger) # print(result) # Check that Socrata client was initialized with correct arguments @@ -109,6 +114,11 @@ def test_secondary_pull_nssp_data(self, mock_socrata): assert result[result['geo_value'].str.startswith('Region') ].empty, "'Region ' need to be removed from geo_value for geo_type 'hhs'" assert (result[result['geo_type'] == 'nation']['geo_value'] == 'National').all(), "All rows with geo_type 'nation' must have geo_value 'National'" + # Check that backup file was created + backup_files = glob.glob(f"{backup_dir}/{today}*") + assert len(backup_files) == 2, "Backup file was not created" + for file in backup_files: + os.remove(file) if __name__ == "__main__": unittest.main() From a52afb17045b426b041cd7a3a0b1cfc9537efdac Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Sun, 17 Nov 2024 14:35:54 -0500 Subject: [PATCH 17/18] Update signal names to _2023RVR in constants.py --- nssp/delphi_nssp/constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index 85cefe1d9..76d51b927 100644 --- a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -50,10 +50,10 @@ } SECONDARY_SIGNALS_MAP = { - "COVID-19": "pct_ed_visits_covid_secondary", - "Influenza": "pct_ed_visits_influenza_secondary", - "RSV": "pct_ed_visits_rsv_secondary", - "Combined": "pct_ed_visits_combined_secondary", + "COVID-19": "pct_ed_visits_covid_2023RVR", + "Influenza": "pct_ed_visits_influenza_2023RVR", + "RSV": "pct_ed_visits_rsv_2023RVR", + "Combined": "pct_ed_visits_combined_2023RVR", } SECONDARY_SIGNALS = [val for (key, val) in SECONDARY_SIGNALS_MAP.items()] From 6cfd4f4a580a5fbd03e5507cba3618c7b0a8e3b1 Mon Sep 17 00:00:00 2001 From: 
minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Sun, 17 Nov 2024 14:38:34 -0500 Subject: [PATCH 18/18] clarify _2023RVR signals in DETAILS.md --- nssp/DETAILS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index 860ccd695..692d85559 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -4,7 +4,7 @@ We import the NSSP Emergency Department Visit data, including percentage and smo There are 2 sources we grab data from for nssp: - Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -- Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview +- Secondary (2023RVR) source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview There are 8 signals output from the primary source and 4 output from secondary. There are no smoothed signals from secondary source. Note that the data produced from secondary source are mostly the same as their primary source equivalent, with past analysis shows around 95% of datapoints having less than 0.1 value difference and the other 5% having a 0.1 to 1.2 value difference. @@ -16,7 +16,7 @@ Primary source: * `national`: just `us` for now, reported from source * `hhs`, `hrr`, `msa`: not reported from source, so we computed them from county-level data using a weighted mean. Each county is assigned a weight equal to its population in the last census (2020). -Secondary source: +Secondary (2023RVR) source: * `state`: reported from source * `hhs`: reported from source * `national`: reported from source @@ -26,5 +26,5 @@ Secondary source: * `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. 
* `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: 3 week moving average of the percentage of emergency department patient visits for specified pathogen. * `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. -* `percent_visits_covid_secondary`, `percent_visits_rsv_secondary`, `percent_visits_influenza_secondary`: Taken from secondary source, percentage of emergency department patient visits for specified pathogen. -* `percent_visits_combined_secondary`: Taken from secondary source, sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file +* `percent_visits_covid_2023RVR`, `percent_visits_rsv_2023RVR`, `percent_visits_influenza_2023RVR`: Taken from the secondary (2023RVR) source; percentage of emergency department patient visits for the specified pathogen. +* `percent_visits_combined_2023RVR`: Taken from the secondary (2023RVR) source; sum of the three percentages of visits for flu, rsv and covid.