Skip to content

first implimentation of rsv #2121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion nhsn/delphi_nhsn/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,36 @@
# column name from socrata
TOTAL_ADMISSION_COVID_COL = "totalconfc19newadm"
TOTAL_ADMISSION_FLU_COL = "totalconfflunewadm"
TOTAL_ADMISSION_RSV_COL = "totalconfrsvnewadm"
NUM_HOSP_REPORTING_COVID_COL = "totalconfc19newadmhosprep"
NUM_HOSP_REPORTING_FLU_COL = "totalconfflunewadmhosprep"

NUM_HOSP_REPORTING_RSV_COL = "totalconfrsvnewadmhosprep"
# signal name
TOTAL_ADMISSION_COVID = "confirmed_admissions_covid_ew"
TOTAL_ADMISSION_FLU = "confirmed_admissions_flu_ew"
TOTAL_ADMISSION_RSV = "confirmed_admissions_rsv_ew"
NUM_HOSP_REPORTING_COVID = "hosprep_confirmed_admissions_covid_ew"
NUM_HOSP_REPORTING_FLU = "hosprep_confirmed_admissions_flu_ew"
NUM_HOSP_REPORTING_RSV = "hosprep_confirmed_admissions_rsv_ew"

SIGNALS_MAP = {
TOTAL_ADMISSION_COVID: TOTAL_ADMISSION_COVID_COL,
TOTAL_ADMISSION_FLU: TOTAL_ADMISSION_FLU_COL,
TOTAL_ADMISSION_RSV: TOTAL_ADMISSION_RSV_COL,
NUM_HOSP_REPORTING_COVID: NUM_HOSP_REPORTING_COVID_COL,
NUM_HOSP_REPORTING_FLU: NUM_HOSP_REPORTING_FLU_COL,
NUM_HOSP_REPORTING_RSV: NUM_HOSP_REPORTING_RSV_COL,
}

TYPE_DICT = {
"timestamp": "datetime64[ns]",
"geo_id": str,
TOTAL_ADMISSION_COVID: float,
TOTAL_ADMISSION_FLU: float,
TOTAL_ADMISSION_RSV: float,
NUM_HOSP_REPORTING_COVID: float,
NUM_HOSP_REPORTING_FLU: float,
NUM_HOSP_REPORTING_RSV: float,
}

# signal mapping for secondary, preliminary source
Expand All @@ -39,15 +46,19 @@
PRELIM_SIGNALS_MAP = {
f"{TOTAL_ADMISSION_COVID}_prelim": TOTAL_ADMISSION_COVID_COL,
f"{TOTAL_ADMISSION_FLU}_prelim": TOTAL_ADMISSION_FLU_COL,
f"{TOTAL_ADMISSION_RSV}_prelim": TOTAL_ADMISSION_RSV_COL,
f"{NUM_HOSP_REPORTING_COVID}_prelim": NUM_HOSP_REPORTING_COVID_COL,
f"{NUM_HOSP_REPORTING_FLU}_prelim": NUM_HOSP_REPORTING_FLU_COL,
f"{NUM_HOSP_REPORTING_RSV}_prelim": NUM_HOSP_REPORTING_RSV_COL,
}

PRELIM_TYPE_DICT = {
"timestamp": "datetime64[ns]",
"geo_id": str,
f"{TOTAL_ADMISSION_COVID}_prelim": float,
f"{TOTAL_ADMISSION_FLU}_prelim": float,
f"{TOTAL_ADMISSION_RSV}_prelim": float,
f"{NUM_HOSP_REPORTING_COVID}_prelim": float,
f"{NUM_HOSP_REPORTING_FLU}_prelim": float,
f"{NUM_HOSP_REPORTING_RSV}_prelim": float,
}
92 changes: 21 additions & 71 deletions nhsn/delphi_nhsn/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ def check_last_updated(socrata_token, dataset_id, logger):
def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):
"""Pull data from Socrata API."""
client = Socrata("data.cdc.gov", socrata_token)
logger.info("Pulling data from Socrata API")
logger.info(
f"Pulling {'main' if dataset_id == MAIN_DATASET_ID else 'preliminary'} data from Socrata API",
dataset_id=dataset_id,
)
results = []
offset = 0
limit = 50000 # maximum limit allowed by SODA 2.0
Expand All @@ -80,7 +83,8 @@ def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):

if results:
df = pd.DataFrame.from_records(results)
create_backup_csv(df, backup_dir, False, logger=logger)
sensor = "prelim" if dataset_id == PRELIM_DATASET_ID else None
create_backup_csv(df, backup_dir, False, sensor=sensor, logger=logger)
else:
df = pd.DataFrame()
return df
Expand Down Expand Up @@ -120,6 +124,7 @@ def pull_nhsn_data(
backup_dir: str,
custom_run: bool,
issue_date: Optional[str],
preliminary: bool = False,
logger: Optional[logging.Logger] = None,
):
"""Pull the latest NHSN hospital admission data, and conforms it into a dataset.
Expand All @@ -137,6 +142,10 @@ def pull_nhsn_data(
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
preliminary: bool
Flag indicating if the grabbing main or preliminary data
issue_date:
date to indicate which backup file to pull for patching
logger: Optional[logging.Logger]
logger object

Expand All @@ -145,22 +154,26 @@ def pull_nhsn_data(
pd.DataFrame
Dataframe as described above.
"""
dataset_id = PRELIM_DATASET_ID if preliminary else MAIN_DATASET_ID
# Pull data from Socrata API
df = (
pull_data(socrata_token, MAIN_DATASET_ID, backup_dir, logger)
pull_data(socrata_token, dataset_id, backup_dir, logger)
if not custom_run
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False)
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=preliminary)
)

recently_updated = True if custom_run else check_last_updated(socrata_token, MAIN_DATASET_ID, logger)
recently_updated = True if custom_run else check_last_updated(socrata_token, dataset_id, logger)

type_dict = PRELIM_TYPE_DICT if preliminary else TYPE_DICT
keep_columns = list(type_dict.keys())
filtered_type_dict = copy.deepcopy(type_dict)

keep_columns = list(TYPE_DICT.keys())
signal_map = PRELIM_SIGNALS_MAP if preliminary else SIGNALS_MAP

if not df.empty and recently_updated:
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
filtered_type_dict = copy.deepcopy(TYPE_DICT)

for signal, col_name in SIGNALS_MAP.items():
for signal, col_name in signal_map.items():
# older backups don't have certain columns
try:
df[signal] = df[col_name]
Expand All @@ -178,66 +191,3 @@ def pull_nhsn_data(
df = pd.DataFrame(columns=keep_columns)

return df


def pull_preliminary_nhsn_data(
socrata_token: str,
backup_dir: str,
custom_run: bool,
issue_date: Optional[str],
logger: Optional[logging.Logger] = None,
):
"""Pull the latest preliminary NHSN hospital admission data, and conforms it into a dataset.

The output dataset has:

- Each row corresponds to a single observation
- Each row additionally has columns for the signals in SIGNALS

Parameters
----------
socrata_token: str
My App Token for pulling the NHSN data
backup_dir: str
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
logger: Optional[logging.Logger]
logger object

Returns
-------
pd.DataFrame
Dataframe as described above.
"""
# Pull data from Socrata API
df = (
pull_data(socrata_token, PRELIM_DATASET_ID, backup_dir, logger)
if not custom_run
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True)
)

keep_columns = list(PRELIM_TYPE_DICT.keys())
recently_updated = True if custom_run else check_last_updated(socrata_token, PRELIM_DATASET_ID, logger)

if not df.empty and recently_updated:
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
filtered_type_dict = copy.deepcopy(PRELIM_TYPE_DICT)

for signal, col_name in PRELIM_SIGNALS_MAP.items():
try:
df[signal] = df[col_name]
except KeyError:
logger.info("column not available in data", col_name=col_name, signal=signal)
keep_columns.remove(signal)
del filtered_type_dict[signal]

df = df[keep_columns]
df = df.astype(filtered_type_dict)

df["geo_id"] = df["geo_id"].str.lower()
df.loc[df["geo_id"] == "usa", "geo_id"] = "us"
else:
df = pd.DataFrame(columns=keep_columns)

return df
7 changes: 4 additions & 3 deletions nhsn/delphi_nhsn/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from delphi_utils.export import create_export_csv

from .constants import GEOS, PRELIM_SIGNALS_MAP, SIGNALS_MAP
from .pull import pull_nhsn_data, pull_preliminary_nhsn_data
from .pull import pull_nhsn_data


def run_module(params, logger=None):
Expand Down Expand Up @@ -56,8 +56,8 @@ def run_module(params, logger=None):
export_start_date = export_start_date.strftime("%Y-%m-%d")

nhsn_df = pull_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger)
preliminary_nhsn_df = pull_preliminary_nhsn_data(
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger
preliminary_nhsn_df = pull_nhsn_data(
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger, preliminary=True
)

geo_mapper = GeoMapper()
Expand Down Expand Up @@ -92,6 +92,7 @@ def run_module(params, logger=None):

df["se"] = np.nan
df["sample_size"] = np.nan

dates = create_export_csv(
df,
geo_res=geo,
Expand Down
Binary file modified nhsn/tests/test_data/20241212.csv.gz
Binary file not shown.
Binary file modified nhsn/tests/test_data/20241212_prelim.csv.gz
Binary file not shown.
21 changes: 21 additions & 0 deletions nhsn/tests/test_data/expected_df.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
timestamp,geo_id,confirmed_admissions_covid_ew,confirmed_admissions_flu_ew,confirmed_admissions_rsv_ew,hosprep_confirmed_admissions_covid_ew,hosprep_confirmed_admissions_flu_ew,hosprep_confirmed_admissions_rsv_ew
2021-08-21,md,53.0,2.0,0.0,13.0,13.0,1.0
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
2021-08-21,us,10384.0,6049.0,84.0,5426.0,5426.0,469.0
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0
2021-10-16,region 1,45978.0,266.0,,5394.0,4307.0,0.0
20 changes: 20 additions & 0 deletions nhsn/tests/test_data/expected_df_prelim.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
timestamp,geo_id,confirmed_admissions_covid_ew_prelim,confirmed_admissions_flu_ew_prelim,confirmed_admissions_rsv_ew_prelim,hosprep_confirmed_admissions_covid_ew_prelim,hosprep_confirmed_admissions_flu_ew_prelim,hosprep_confirmed_admissions_rsv_ew_prelim
2021-08-21,mi,269.0,523.0,1.0,152.0,152.0,4.0
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
2021-08-21,us,8946.0,5576.0,61.0,5422.0,5422.0,485.0
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0
Loading
Loading