-
Notifications
You must be signed in to change notification settings - Fork 16
2130 deal with large number of null values in nssp data #2141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
8fccf03
4a9554c
6c2bba8
a1211b3
9c34aa8
3ee0ae4
bd3ad70
bb0c62a
0674e32
7929b48
ec826cb
8c51c50
0635eae
8cec78f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
minhkhul marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
[ | ||
{ | ||
"week_end":"2022-10-15T00:00:00.000", | ||
"geography":"United States", | ||
"county":"All", | ||
"percent_visits_combined":"2.0", | ||
"percent_visits_covid":"1.63", | ||
"percent_visits_influenza":"0.17", | ||
"percent_visits_rsv":"0.21", | ||
"percent_visits_smoothed":"1.78", | ||
"percent_visits_smoothed_covid":"1.54", | ||
"percent_visits_smoothed_1":"0.12", | ||
"percent_visits_smoothed_rsv":"0.12", | ||
"ed_trends_covid":"Decreasing", | ||
"ed_trends_influenza":"No Change", | ||
"ed_trends_rsv":"Increasing", | ||
"hsa":"All", | ||
"hsa_counties":"All", | ||
"hsa_nci_id":"All", | ||
"fips":"0", | ||
"trend_source":"United States", | ||
"buildnumber":"2025-02-08" | ||
}, | ||
{ | ||
"week_end":"2022-10-15T00:00:00.000", | ||
"geography":"Colorado", | ||
"county":"Chaffee", | ||
"ed_trends_covid":"Data Unavailable", | ||
"ed_trends_influenza":"Data Unavailable", | ||
"ed_trends_rsv":"Data Unavailable", | ||
"hsa":"Chaffee, CO - Lake, CO", | ||
"hsa_counties":"Chaffee, Lake", | ||
"hsa_nci_id":"786", | ||
"fips":"8015", | ||
"trend_source":"HSA", | ||
"buildnumber":"2025-02-28" | ||
}, | ||
{ | ||
"week_end":"2022-10-15T00:00:00.000", | ||
"geography":"Colorado", | ||
"county":"Arapahoe", | ||
"ed_trends_covid":"Data Unavailable", | ||
"ed_trends_influenza":"Data Unavailable", | ||
"ed_trends_rsv":"Data Unavailable", | ||
"hsa":"Denver (Denver), CO - Jefferson, CO", | ||
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit", | ||
"hsa_nci_id":"688", | ||
"fips":"8005", | ||
"trend_source":"HSA", | ||
"buildnumber":"2025-03-28" | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,25 @@ | ||
import glob | ||
from datetime import datetime, date | ||
import json | ||
from pathlib import Path | ||
from unittest.mock import patch | ||
import tempfile | ||
import logging | ||
import os | ||
import time | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from delphi_nssp.constants import GEOS, SIGNALS_MAP | ||
from delphi_nssp.run import add_needed_columns | ||
from epiweeks import Week | ||
from pandas.testing import assert_frame_equal | ||
from delphi_nssp.constants import GEOS, SIGNALS, SIGNALS_MAP, DATASET_ID | ||
from delphi_nssp.run import ( | ||
add_needed_columns | ||
) | ||
|
||
|
||
def remove_backup_and_receiving(params): | ||
minhkhul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
export_dir = params["common"]["export_dir"] | ||
for file in Path(export_dir).glob("*.csv"): | ||
os.remove(file) | ||
|
||
today = pd.Timestamp.today().strftime("%Y%m%d") | ||
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*") | ||
for file in backup_dir: | ||
os.remove(file) | ||
|
||
class TestRun: | ||
def test_add_needed_columns(self): | ||
df = pd.DataFrame({"geo_id": ["us"], "val": [1]}) | ||
|
@@ -68,13 +70,10 @@ def test_output_files_exist(self, params, run_as_module): | |
] | ||
assert set(expected_columns).issubset(set(df.columns.values)) | ||
|
||
for file in Path(export_dir).glob("*.csv"): | ||
os.remove(file) | ||
# Verify that there are no NA/empty values in the val column | ||
assert not df["val"].isnull().any() | ||
minhkhul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
today = pd.Timestamp.today().strftime("%Y%m%d") | ||
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*") | ||
for file in backup_dir: | ||
os.remove(file) | ||
remove_backup_and_receiving(params) | ||
|
||
def test_valid_hrr(self, run_as_module_hrr, params): | ||
export_dir = params["common"]["export_dir"] | ||
|
@@ -85,10 +84,23 @@ def test_valid_hrr(self, run_as_module_hrr, params): | |
df = pd.read_csv(f) | ||
assert (df.val == 100).all() | ||
|
||
for file in Path(export_dir).glob("*.csv"): | ||
os.remove(file) | ||
remove_backup_and_receiving(params) | ||
|
||
def test_empty_data(self, run_as_module_empty, params, caplog): | ||
""" | ||
Tests correct handling when there is a geo and signal combination that has no data. | ||
""" | ||
|
||
caplog.set_level(logging.WARNING) | ||
minhkhul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
run_as_module_empty() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer suggestion: "if you put …" (the rest of the suggestion is missing from this capture). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: @minhkhul and I had a chat... this suggestion doesn't work, possibly because of how fixtures behave in general or how we are using them. |
||
assert "No data for signal and geo combination" in caplog.text | ||
|
||
export_dir = params["common"]["export_dir"] | ||
csv_files = [f for f in Path(export_dir).glob("*.csv")] | ||
|
||
# Since only one national entry in page_no_data.json with numeric data, | ||
# while the two counties have no numeric fields, | ||
# there should be no county, hrr, hhs, or msa files. | ||
assert not any(geo in f.name for geo in ["county", "hrr", "hhs", "msa"] for f in csv_files) | ||
minhkhul marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
today = pd.Timestamp.today().strftime("%Y%m%d") | ||
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*") | ||
for file in backup_dir: | ||
os.remove(file) | ||
remove_backup_and_receiving(params) |
Uh oh!
There was an error while loading. Please reload this page.