From 18315c795afda8c8006110a7d8d18fb6d0b18c2f Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 16 Jan 2025 10:58:54 -0500 Subject: [PATCH 1/4] delete secondary --- nssp/delphi_nssp/constants.py | 26 ----------------- nssp/delphi_nssp/pull.py | 54 ----------------------------------- nssp/delphi_nssp/run.py | 51 ++------------------------------- nssp/tests/test_pull.py | 44 ---------------------------- 4 files changed, 2 insertions(+), 173 deletions(-) diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index 76d51b927..9b98d2012 100644 --- a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -41,29 +41,3 @@ "fips": str, } ) - -SECONDARY_COLS_MAP = { - "week_end": "timestamp", - "geography": "geo_value", - "percent_visits": "val", - "pathogen": "signal", -} - -SECONDARY_SIGNALS_MAP = { - "COVID-19": "pct_ed_visits_covid_2023RVR", - "Influenza": "pct_ed_visits_influenza_2023RVR", - "RSV": "pct_ed_visits_rsv_2023RVR", - "Combined": "pct_ed_visits_combined_2023RVR", -} - -SECONDARY_SIGNALS = [val for (key, val) in SECONDARY_SIGNALS_MAP.items()] -SECONDARY_GEOS = ["state", "nation", "hhs"] - -SECONDARY_TYPE_DICT = { - "timestamp": "datetime64[ns]", - "geo_value": str, - "val": float, - "geo_type": str, - "signal": str, -} -SECONDARY_KEEP_COLS = [key for (key, val) in SECONDARY_TYPE_DICT.items()] diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 94058dea8..8fcc4da09 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -10,10 +10,6 @@ from .constants import ( NEWLINE, - SECONDARY_COLS_MAP, - SECONDARY_KEEP_COLS, - SECONDARY_SIGNALS_MAP, - SECONDARY_TYPE_DICT, SIGNALS, SIGNALS_MAP, TYPE_DICT, @@ -96,53 +92,3 @@ def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger keep_columns = ["timestamp", "geography", "county", "fips"] return df_ervisits[SIGNALS + keep_columns] - - -def secondary_pull_nssp_data( - socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None -): - """Pull the latest NSSP ER visits secondary dataset. - - https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview - - The output dataset has: - - - Each row corresponds to a single observation - - Parameters - ---------- - socrata_token: str - My App Token for pulling the NSSP data (could be the same as the nchs data) - - Returns - ------- - pd.DataFrame - Dataframe as described above. - """ - socrata_results = pull_with_socrata_api(socrata_token, "7mra-9cq9") - df_ervisits = pd.DataFrame.from_records(socrata_results) - create_backup_csv(df_ervisits, backup_dir, custom_run, sensor="secondary", logger=logger) - df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP) - - # geo_type is not provided in the dataset, so we infer it from the geo_value - # which is either state names, "National" or hhs region numbers - df_ervisits["geo_type"] = "state" - - df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation" - - hhs_region_mask = df_ervisits["geo_value"].str.lower().str.startswith("region ") - df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[hhs_region_mask, "geo_value"].str.replace( - "Region ", "" - ) - df_ervisits.loc[hhs_region_mask, "geo_type"] = "hhs" - - df_ervisits["signal"] = df_ervisits["signal"].map(SECONDARY_SIGNALS_MAP) - - df_ervisits = df_ervisits[SECONDARY_KEEP_COLS] - - try: - df_ervisits = df_ervisits.astype(SECONDARY_TYPE_DICT) - except KeyError as exc: - raise ValueError(warn_string(df_ervisits, SECONDARY_TYPE_DICT)) from exc - - return df_ervisits diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 417c49ab2..7e069e483 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -31,8 +31,8 @@ from delphi_utils.geomap import GeoMapper from delphi_utils.nancodes import add_default_nancodes -from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SECONDARY_GEOS, SECONDARY_SIGNALS, SIGNALS -from .pull import pull_nssp_data, secondary_pull_nssp_data +from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SIGNALS +from .pull import pull_nssp_data def add_needed_columns(df, col_names=None): @@ -141,52 +141,5 @@ def run_module(params): if len(dates) > 0: run_stats.append((max(dates), len(dates))) - logger.info("Generating secondary signals") - secondary_df_pull = secondary_pull_nssp_data(socrata_token, backup_dir, custom_run, logger) - for signal in SECONDARY_SIGNALS: - secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] - if secondary_df_pull_signal.empty: - logger.warning("No data found for signal", signal=signal) - continue - for geo in SECONDARY_GEOS: - df = secondary_df_pull_signal.copy() - logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal) - if geo == "state": - df = df[(df["geo_type"] == "state")] - df["geo_id"] = df["geo_value"].apply( - lambda x: ( - us.states.lookup(x).abbr.lower() - if us.states.lookup(x) - else ("dc" if x == "District of Columbia" else x) - ) - ) - unexpected_state_names = df[df["geo_id"] == df["geo_value"]] - if unexpected_state_names.shape[0] > 0: - logger.error( - "Unexpected state names", - unexpected_state_names=unexpected_state_names["geo_value"].unique(), - ) - raise RuntimeError - elif geo == "nation": - df = df[(df["geo_type"] == "nation")] - df["geo_id"] = "us" - elif geo == "hhs": - df = df[(df["geo_type"] == "hhs")] - df["geo_id"] = df["geo_value"] - # add se, sample_size, and na codes - missing_cols = set(CSV_COLS) - set(df.columns) - df = add_needed_columns(df, col_names=list(missing_cols)) - df_csv = df[CSV_COLS + ["timestamp"]] - # actual export - dates = create_export_csv( - df_csv, - geo_res=geo, - export_dir=export_dir, - sensor=signal, - weekly_dates=True, - ) - if len(dates) > 0: - run_stats.append((max(dates), len(dates))) - ## log this indicator run logging(start_time, run_stats, logger) diff --git a/nssp/tests/test_pull.py b/nssp/tests/test_pull.py index 30debd6cd..a03221019 100644 --- a/nssp/tests/test_pull.py +++ b/nssp/tests/test_pull.py @@ -7,16 +7,11 @@ from delphi_nssp.pull import ( pull_nssp_data, - secondary_pull_nssp_data, pull_with_socrata_api, ) from delphi_nssp.constants import ( NEWLINE, - SECONDARY_COLS_MAP, - SECONDARY_KEEP_COLS, - SECONDARY_SIGNALS_MAP, - SECONDARY_TYPE_DICT, SIGNALS, SIGNALS_MAP, TYPE_DICT, @@ -81,44 +76,5 @@ def test_pull_nssp_data(self, mock_socrata, caplog): for file in backup_files: os.remove(file) - @patch("delphi_nssp.pull.Socrata") - def test_secondary_pull_nssp_data(self, mock_socrata): - today = pd.Timestamp.today().strftime("%Y%m%d") - backup_dir = 'test_raw_data_backups' - - # Load test data - with open("test_data/secondary_page.txt", "r") as f: - test_data = json.load(f) - - # Mock Socrata client and its get method - mock_client = MagicMock() - mock_client.get.side_effect = [test_data, []] # Return test data on first call, empty list on second call - mock_socrata.return_value = mock_client - - custom_run = False - logger = get_structured_logger() - # Call function with test token - test_token = "test_token" - result = secondary_pull_nssp_data(test_token, backup_dir, custom_run, logger) - # print(result) - - # Check that Socrata client was initialized with correct arguments - mock_socrata.assert_called_once_with("data.cdc.gov", test_token) - - # Check that get method was called with correct arguments - mock_client.get.assert_any_call("7mra-9cq9", limit=50000, offset=0) - - for col in SECONDARY_KEEP_COLS: - assert result[col].notnull().all(), f"{col} has rogue NaN" - - assert result[result['geo_value'].str.startswith('Region') ].empty, "'Region ' need to be removed from geo_value for geo_type 'hhs'" - assert (result[result['geo_type'] == 'nation']['geo_value'] == 'National').all(), "All rows with geo_type 'nation' must have geo_value 'National'" - - # Check that backup file was created - backup_files = glob.glob(f"{backup_dir}/{today}*") - assert len(backup_files) == 2, "Backup file was not created" - for file in backup_files: - os.remove(file) - if __name__ == "__main__": unittest.main() From 91692b6df3deaf09762ea6f22e23c27ca3633417 Mon Sep 17 00:00:00 2001 From: minhkhul Date: Thu, 16 Jan 2025 11:07:13 -0500 Subject: [PATCH 2/4] remove docs --- nssp/DETAILS.md | 15 ++------------- nssp/README.md | 4 +--- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index 692d85559..7200f0724 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -2,12 +2,8 @@ We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage of ER visits attributable to a given pathogen, from the CDC website. The data is provided at the county level, state level and national level; we do a population-weighted mean to aggregate from county data up to the HRR and MSA levels. -There are 2 sources we grab data from for nssp: -- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -- Secondary (2023RVR) source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview -There are 8 signals output from the primary source and 4 output from secondary. There are no smoothed signals from secondary source. +NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -Note that the data produced from secondary source are mostly the same as their primary source equivalent, with past analysis shows around 95% of datapoints having less than 0.1 value difference and the other 5% having a 0.1 to 1.2 value difference. ## Geographical Levels Primary source: @@ -16,15 +12,8 @@ Primary source: * `national`: just `us` for now, reported from source * `hhs`, `hrr`, `msa`: not reported from source, so we computed them from county-level data using a weighted mean. Each county is assigned a weight equal to its population in the last census (2020). -Secondary (2023RVR) source: -* `state`: reported from source -* `hhs`: reported from source -* `national`: reported from source - ## Metrics * `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen. * `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. * `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: 3 week moving average of the percentage of emergency department patient visits for specified pathogen. -* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. -* `percent_visits_covid_2023RVR`, `percent_visits_rsv_2023RVR`, `percent_visits_influenza_2023RVR`: Taken from secondary source, percentage of emergency department patient visits for specified pathogen. -* `percent_visits_combined_2023RVR`: Taken from secondary source, sum of the three percentages of visits for flu, rsv and covid. +* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file diff --git a/nssp/README.md b/nssp/README.md index c3f57b94b..d062771c0 100644 --- a/nssp/README.md +++ b/nssp/README.md @@ -2,9 +2,7 @@ We import the NSSP Emergency Department Visit data, currently only the smoothed concentration, from the CDC website, aggregate to the state and national level from the wastewater sample site level, and export the aggregated data. -There are 2 sources we grab data from for nssp: -- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -- Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview +NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview For details see the `DETAILS.md` file in this directory. From a33e2c4f89bbfdbe8068d35fa4df179b663722fd Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:19:50 -0500 Subject: [PATCH 3/4] Update nssp/DETAILS.md Co-authored-by: george --- nssp/DETAILS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index 7200f0724..77a7b429a 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -6,7 +6,6 @@ NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency ## Geographical Levels -Primary source: * `state`: reported from source using two-letter postal code * `county`: reported from source using fips code * `national`: just `us` for now, reported from source From 70a517b18cfd40cb920f7aada978c4b6dc2f8159 Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:19:56 -0500 Subject: [PATCH 4/4] Update nssp/DETAILS.md Co-authored-by: george --- nssp/DETAILS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index 77a7b429a..9bac16879 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -4,7 +4,6 @@ We import the NSSP Emergency Department Visit data, including percentage and smo NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview - ## Geographical Levels * `state`: reported from source using two-letter postal code * `county`: reported from source using fips code