Skip to content

2130 deal with large number of null values in nssp data #2141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
9 changes: 8 additions & 1 deletion nssp/delphi_nssp/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def run_module(params, logger=None):
logger.warning("No primary source data pulled", issue_date=issue_date)
break
for geo in GEOS:
df = df_pull.copy()
df = df_pull.copy(deep=True)
df["val"] = df[signal]
logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal)
if geo == "nation":
Expand Down Expand Up @@ -144,6 +144,13 @@ def run_module(params, logger=None):
missing_cols = set(CSV_COLS) - set(df.columns)
df = add_needed_columns(df, col_names=list(missing_cols))
df_csv = df[CSV_COLS + ["timestamp"]]

# remove rows with missing values
df_csv = df_csv[df_csv["val"].notnull()]
if df_csv.empty:
logger.warning("No data for signal and geo combination", signal=signal, geo=geo)
continue

# actual export
dates = create_export_csv(
df_csv,
Expand Down
36 changes: 36 additions & 0 deletions nssp/tests/test_data/page.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,41 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"_comment":"This record is for testing the case where all signals data is NA for a county",
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"_comment":"This record is for testing the case where some signal data (combined signals) is NA for a county",
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "1",
"percent_visits_influenza": "1",
"percent_visits_rsv": "1",
"percent_visits_smoothed_covid": "1",
"percent_visits_smoothed_1": "1",
"percent_visits_smoothed_rsv": "1",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
36 changes: 36 additions & 0 deletions nssp/tests/test_data/page_100_hrr.json
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,41 @@
"fips": "8101",
"trend_source": "HSA",
"buildnumber": "2025-02-28"
},
{
"_comment":"This record is for testing the case where all signals data is NA for a county",
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Chaffee",
"ed_trends_covid":"Data Unavailable",
"ed_trends_influenza":"Data Unavailable",
"ed_trends_rsv":"Data Unavailable",
"hsa":"Chaffee, CO - Lake, CO",
"hsa_counties":"Chaffee, Lake",
"hsa_nci_id":"786",
"fips":"8015",
"trend_source":"HSA",
"buildnumber":"2025-02-28"
},
{
"_comment":"This record is for testing the case where some signal data (combined signals) is NA for a county",
"week_end":"2022-10-15T00:00:00.000",
"geography":"Colorado",
"county":"Arapahoe",
"percent_visits_covid": "100",
"percent_visits_influenza": "100",
"percent_visits_rsv": "100",
"percent_visits_smoothed_covid": "100",
"percent_visits_smoothed_1": "100",
"percent_visits_smoothed_rsv": "100",
"ed_trends_covid":"Decreasing",
"ed_trends_influenza":"Decreasing",
"ed_trends_rsv":"Decreasing",
"hsa":"Denver (Denver), CO - Jefferson, CO",
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
"hsa_nci_id":"688",
"fips":"8005",
"trend_source":"HSA",
"buildnumber":"2025-03-28"
}
]
4 changes: 0 additions & 4 deletions nssp/tests/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,5 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
assert result["fips"].notnull().all(), "fips has rogue NaN"
assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if aplicable"

# Check for each signal in SIGNALS
for signal in SIGNALS:
assert result[signal].notnull().all(), f"{signal} has rogue NaN"

for file in backup_files:
os.remove(file)
3 changes: 3 additions & 0 deletions nssp/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def test_output_files_exist(self, params, run_as_module):
]
assert set(expected_columns).issubset(set(df.columns.values))

# Verify that there are no NA/empty values in the val column
assert not df["val"].isnull().any()

for file in Path(export_dir).glob("*.csv"):
os.remove(file)

Expand Down
Loading