Skip to content

2130: Deal with large number of null values in NSSP data #2141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
7 changes: 7 additions & 0 deletions nssp/delphi_nssp/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ def run_module(params, logger=None):
missing_cols = set(CSV_COLS) - set(df.columns)
df = add_needed_columns(df, col_names=list(missing_cols))
df_csv = df[CSV_COLS + ["timestamp"]]

# remove rows with missing values
df_csv = df_csv[df_csv["val"].notnull()]
if df_csv.empty:
logger.warning("No data for signal and geo combination", signal=signal, geo=geo)
continue

# actual export
dates = create_export_csv(
df_csv,
Expand Down
34 changes: 34 additions & 0 deletions nssp/tests/test_data/page.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,39 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "1",
"percent_visits_influenza": "1",
"percent_visits_rsv": "1",
"percent_visits_smoothed_covid": "1",
"percent_visits_smoothed_1": "1",
"percent_visits_smoothed_rsv": "1",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
34 changes: 34 additions & 0 deletions nssp/tests/test_data/page_100_hrr.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,39 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "100",
"percent_visits_influenza": "100",
"percent_visits_rsv": "100",
"percent_visits_smoothed_covid": "100",
"percent_visits_smoothed_1": "100",
"percent_visits_smoothed_rsv": "100",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
52 changes: 52 additions & 0 deletions nssp/tests/test_data/page_no_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
[
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"United States",
"county":"All",
"percent_visits_combined":"2.0",
"percent_visits_covid":"1.63",
"percent_visits_influenza":"0.17",
"percent_visits_rsv":"0.21",
"percent_visits_smoothed":"1.78",
"percent_visits_smoothed_covid":"1.54",
"percent_visits_smoothed_1":"0.12",
"percent_visits_smoothed_rsv":"0.12",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"No Change",
"ed_trends_rsv":"Increasing",
"hsa":"All",
"hsa_counties":"All",
"hsa_nci_id":"All",
"fips":"0",
"trend_source":"United States",
"buildnumber":"2025-02-08"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
4 changes: 0 additions & 4 deletions nssp/tests/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,5 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
assert result["fips"].notnull().all(), "fips has rogue NaN"
assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if aplicable"

# Check for each signal in SIGNALS
for signal in SIGNALS:
assert result[signal].notnull().all(), f"{signal} has rogue NaN"

for file in backup_files:
os.remove(file)
69 changes: 44 additions & 25 deletions nssp/tests/test_run.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
import glob
from datetime import datetime, date
import json
import logging
import os
from pathlib import Path
import json
from unittest.mock import patch
import tempfile
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
from delphi_nssp.constants import GEOS, SIGNALS_MAP, DATASET_ID
from delphi_nssp.run import add_needed_columns, run_module
from epiweeks import Week
from pandas.testing import assert_frame_equal
from delphi_nssp.constants import GEOS, SIGNALS, SIGNALS_MAP, DATASET_ID
from delphi_nssp.run import (
add_needed_columns
)

TEST_DIR = Path(__file__).parent

def remove_backup_and_receiving(params):
    """Clean up artifacts a test run leaves behind.

    Deletes every ``*.csv`` under ``params["common"]["export_dir"]`` and any
    file in ``params["common"]["backup_dir"]`` whose name begins with today's
    date stamp (``YYYYMMDD``), so subsequent tests start from a clean slate.
    """
    common = params["common"]

    # Receiving/export directory: drop every CSV the indicator wrote.
    for csv_path in Path(common["export_dir"]).glob("*.csv"):
        os.remove(csv_path)

    # Backup directory: drop only files created today (date-stamped prefix),
    # leaving older backups untouched.
    stamp = pd.Timestamp.today().strftime("%Y%m%d")
    for backup_path in glob.glob(f"{Path(common['backup_dir'])}/{stamp}*"):
        os.remove(backup_path)

class TestRun:
def test_add_needed_columns(self):
Expand All @@ -42,7 +46,7 @@ def generate_week_file_prefix(self, dates):
]
return date_prefix

def test_output_files_exist(self, params, run_as_module):
def test_output_files(self, params, run_as_module):
export_dir = params["common"]["export_dir"]
csv_files = [f.name for f in Path(export_dir).glob("*.csv")]

Expand All @@ -68,13 +72,10 @@ def test_output_files_exist(self, params, run_as_module):
]
assert set(expected_columns).issubset(set(df.columns.values))

for file in Path(export_dir).glob("*.csv"):
os.remove(file)
# Verify that there's no NA/empty values in the val columns
assert not df["val"].isnull().any()

today = pd.Timestamp.today().strftime("%Y%m%d")
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
for file in backup_dir:
os.remove(file)
remove_backup_and_receiving(params)

def test_valid_hrr(self, run_as_module_hrr, params):
export_dir = params["common"]["export_dir"]
Expand All @@ -85,10 +86,28 @@ def test_valid_hrr(self, run_as_module_hrr, params):
df = pd.read_csv(f)
assert (df.val == 100).all()

for file in Path(export_dir).glob("*.csv"):
os.remove(file)
remove_backup_and_receiving(params)

@patch("sodapy.Socrata.get")
def test_empty_data(self, mock_get, params, caplog):
"""
Tests correct handling when there is a geo and signal combination that has no data.
"""

with open(f"{TEST_DIR}/test_data/page_no_data.json", "r") as f:
EMPTY_TEST_DATA = json.load(f)
mock_get.side_effect = [EMPTY_TEST_DATA, []]
run_module(params)

assert "No data for signal and geo combination" in caplog.text

export_dir = params["common"]["export_dir"]
csv_files = [f for f in Path(export_dir).glob("*.csv")]

# Since only one national entry in page_no_data.json with numeric data,
# while the two counties have no numeric fields,
# there should be no county, hrr, hhs, or msa files.
assert not any(geo in f.name for geo in ["county", "hrr", "hhs", "msa"] for f in csv_files)
assert all("nation" in f.name for f in csv_files)

today = pd.Timestamp.today().strftime("%Y%m%d")
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
for file in backup_dir:
os.remove(file)
remove_backup_and_receiving(params)