Skip to content

2130 deal with large number of null values in nssp data #2141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
7 changes: 7 additions & 0 deletions nssp/delphi_nssp/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ def run_module(params, logger=None):
missing_cols = set(CSV_COLS) - set(df.columns)
df = add_needed_columns(df, col_names=list(missing_cols))
df_csv = df[CSV_COLS + ["timestamp"]]

# remove rows with missing values
df_csv = df_csv[df_csv["val"].notnull()]
if df_csv.empty:
logger.warning("No data for signal and geo combination", signal=signal, geo=geo)
continue

# actual export
dates = create_export_csv(
df_csv,
Expand Down
34 changes: 29 additions & 5 deletions nssp/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import copy
import json
import time
from unittest.mock import patch, MagicMock

import pytest
from pathlib import Path
from unittest.mock import patch

from delphi_nssp.run import run_module
import pytest
from delphi_nssp.constants import DATASET_ID
from delphi_nssp.run import run_module

TEST_DIR = Path(__file__).parent

Expand All @@ -20,6 +18,9 @@
with open(f"{TEST_DIR}/test_data/page_100_hrr.json", "r") as f:
HRR_TEST_DATA = json.load(f)

with open(f"{TEST_DIR}/test_data/page_no_data.json", "r") as f:
EMPTY_TEST_DATA = json.load(f)

@pytest.fixture(scope="session")
def params():
params = {
Expand Down Expand Up @@ -99,3 +100,26 @@ def side_effect(*args, **kwargs):
mock_get.side_effect = side_effect
run_module(params)

@pytest.fixture(scope="function")
def run_as_module_empty(params):
    """
    Fixture that runs run_module against EMPTY_TEST_DATA.

    Patches sodapy's Socrata.get so the pipeline receives the predefined
    test payload in which the relevant numeric fields are empty.
    """

    def _run_as_module_empty():
        with patch("sodapy.Socrata.get") as mock_get:

            def side_effect(*args, **kwargs):
                # First page: serve the empty-value fixture for the NSSP
                # dataset; any other dataset gets no rows.
                if kwargs["offset"] == 0:
                    return EMPTY_TEST_DATA if DATASET_ID in args[0] else []
                # NOTE(review): nonzero offsets fall through to an implicit
                # None return — presumably treated as end-of-data by the
                # puller; confirm against the pagination logic.

            mock_get.side_effect = side_effect
            run_module(params)

    return _run_as_module_empty
34 changes: 34 additions & 0 deletions nssp/tests/test_data/page.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,39 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "1",
"percent_visits_influenza": "1",
"percent_visits_rsv": "1",
"percent_visits_smoothed_covid": "1",
"percent_visits_smoothed_1": "1",
"percent_visits_smoothed_rsv": "1",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
34 changes: 34 additions & 0 deletions nssp/tests/test_data/page_100_hrr.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,39 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "100",
"percent_visits_influenza": "100",
"percent_visits_rsv": "100",
"percent_visits_smoothed_covid": "100",
"percent_visits_smoothed_1": "100",
"percent_visits_smoothed_rsv": "100",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
52 changes: 52 additions & 0 deletions nssp/tests/test_data/page_no_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
[
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"United States",
"county":"All",
"percent_visits_combined":"2.0",
"percent_visits_covid":"1.63",
"percent_visits_influenza":"0.17",
"percent_visits_rsv":"0.21",
"percent_visits_smoothed":"1.78",
"percent_visits_smoothed_covid":"1.54",
"percent_visits_smoothed_1":"0.12",
"percent_visits_smoothed_rsv":"0.12",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"No Change",
"ed_trends_rsv":"Increasing",
"hsa":"All",
"hsa_counties":"All",
"hsa_nci_id":"All",
"fips":"0",
"trend_source":"United States",
"buildnumber":"2025-02-08"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
4 changes: 0 additions & 4 deletions nssp/tests/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,5 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
assert result["fips"].notnull().all(), "fips has rogue NaN"
assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if aplicable"

# Check for each signal in SIGNALS
for signal in SIGNALS:
assert result[signal].notnull().all(), f"{signal} has rogue NaN"

for file in backup_files:
os.remove(file)
60 changes: 36 additions & 24 deletions nssp/tests/test_run.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import glob
from datetime import datetime, date
import json
from pathlib import Path
from unittest.mock import patch
import tempfile
import logging
import os
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from delphi_nssp.constants import GEOS, SIGNALS_MAP
from delphi_nssp.run import add_needed_columns
from epiweeks import Week
from pandas.testing import assert_frame_equal
from delphi_nssp.constants import GEOS, SIGNALS, SIGNALS_MAP, DATASET_ID
from delphi_nssp.run import (
add_needed_columns
)


def remove_backup_and_receiving(params):
    """Delete test artifacts: exported CSVs and today's backup files.

    Removes every ``*.csv`` under ``params["common"]["export_dir"]`` and
    every file in ``params["common"]["backup_dir"]`` whose name starts
    with today's ``YYYYMMDD`` date stamp.
    """
    common = params["common"]

    export_path = Path(common["export_dir"])
    for csv_file in export_path.glob("*.csv"):
        os.remove(csv_file)

    # Backups are prefixed with the run date, so only today's files match.
    date_prefix = pd.Timestamp.today().strftime("%Y%m%d")
    for backup_file in glob.glob(f"{Path(common['backup_dir'])}/{date_prefix}*"):
        os.remove(backup_file)

class TestRun:
def test_add_needed_columns(self):
df = pd.DataFrame({"geo_id": ["us"], "val": [1]})
Expand Down Expand Up @@ -68,13 +70,10 @@ def test_output_files_exist(self, params, run_as_module):
]
assert set(expected_columns).issubset(set(df.columns.values))

for file in Path(export_dir).glob("*.csv"):
os.remove(file)
# Verify that there's no NA/empty values in the val columns
assert not df["val"].isnull().any()

today = pd.Timestamp.today().strftime("%Y%m%d")
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
for file in backup_dir:
os.remove(file)
remove_backup_and_receiving(params)

def test_valid_hrr(self, run_as_module_hrr, params):
export_dir = params["common"]["export_dir"]
Expand All @@ -85,10 +84,23 @@ def test_valid_hrr(self, run_as_module_hrr, params):
df = pd.read_csv(f)
assert (df.val == 100).all()

for file in Path(export_dir).glob("*.csv"):
os.remove(file)
remove_backup_and_receiving(params)

def test_empty_data(self, run_as_module_empty, params, caplog):
"""
Tests correct handling when there is a geo and signal combination that has no data.
"""

caplog.set_level(logging.WARNING)
run_as_module_empty()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you put `caplog` before `run_as_module_empty` in the method's argument list, I don't think you will need the encapsulated definition of `_run_as_module_empty` (and then you won't need to execute it explicitly here).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@minhkhul and I had a chat — this suggestion doesn't work, possibly because of how fixtures behave in general or how we are using them.

assert "No data for signal and geo combination" in caplog.text

export_dir = params["common"]["export_dir"]
csv_files = [f for f in Path(export_dir).glob("*.csv")]

# Since only one national entry in page_no_data.json with numeric data,
# while the two counties have no numeric fields,
# there should be no county, hrr, hhs, or msa files.
assert not any(geo in f.name for geo in ["county", "hrr", "hhs", "msa"] for f in csv_files)

today = pd.Timestamp.today().strftime("%Y%m%d")
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
for file in backup_dir:
os.remove(file)
remove_backup_and_receiving(params)
Loading