Skip to content

Commit 70abef5

Browse files
aysim319 and nmdefries authored
first implementation of rsv (#2121)
* first implementation of rsv * lint * clean up and fix tests * more logging and lint * suggested changes and refactor pull_nhsn_data * linting * trailing comma * fixing backup overwrite --------- Co-authored-by: nmdefries <[email protected]>
1 parent d0a6393 commit 70abef5

11 files changed

+753
-394
lines changed

nhsn/delphi_nhsn/constants.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,29 +8,36 @@
88
# column name from socrata
99
TOTAL_ADMISSION_COVID_COL = "totalconfc19newadm"
1010
TOTAL_ADMISSION_FLU_COL = "totalconfflunewadm"
11+
TOTAL_ADMISSION_RSV_COL = "totalconfrsvnewadm"
1112
NUM_HOSP_REPORTING_COVID_COL = "totalconfc19newadmhosprep"
1213
NUM_HOSP_REPORTING_FLU_COL = "totalconfflunewadmhosprep"
13-
14+
NUM_HOSP_REPORTING_RSV_COL = "totalconfrsvnewadmhosprep"
1415
# signal name
1516
TOTAL_ADMISSION_COVID = "confirmed_admissions_covid_ew"
1617
TOTAL_ADMISSION_FLU = "confirmed_admissions_flu_ew"
18+
TOTAL_ADMISSION_RSV = "confirmed_admissions_rsv_ew"
1719
NUM_HOSP_REPORTING_COVID = "hosprep_confirmed_admissions_covid_ew"
1820
NUM_HOSP_REPORTING_FLU = "hosprep_confirmed_admissions_flu_ew"
21+
NUM_HOSP_REPORTING_RSV = "hosprep_confirmed_admissions_rsv_ew"
1922

2023
SIGNALS_MAP = {
2124
TOTAL_ADMISSION_COVID: TOTAL_ADMISSION_COVID_COL,
2225
TOTAL_ADMISSION_FLU: TOTAL_ADMISSION_FLU_COL,
26+
TOTAL_ADMISSION_RSV: TOTAL_ADMISSION_RSV_COL,
2327
NUM_HOSP_REPORTING_COVID: NUM_HOSP_REPORTING_COVID_COL,
2428
NUM_HOSP_REPORTING_FLU: NUM_HOSP_REPORTING_FLU_COL,
29+
NUM_HOSP_REPORTING_RSV: NUM_HOSP_REPORTING_RSV_COL,
2530
}
2631

2732
TYPE_DICT = {
2833
"timestamp": "datetime64[ns]",
2934
"geo_id": str,
3035
TOTAL_ADMISSION_COVID: float,
3136
TOTAL_ADMISSION_FLU: float,
37+
TOTAL_ADMISSION_RSV: float,
3238
NUM_HOSP_REPORTING_COVID: float,
3339
NUM_HOSP_REPORTING_FLU: float,
40+
NUM_HOSP_REPORTING_RSV: float,
3441
}
3542

3643
# signal mapping for secondary, preliminary source
@@ -39,15 +46,19 @@
3946
PRELIM_SIGNALS_MAP = {
4047
f"{TOTAL_ADMISSION_COVID}_prelim": TOTAL_ADMISSION_COVID_COL,
4148
f"{TOTAL_ADMISSION_FLU}_prelim": TOTAL_ADMISSION_FLU_COL,
49+
f"{TOTAL_ADMISSION_RSV}_prelim": TOTAL_ADMISSION_RSV_COL,
4250
f"{NUM_HOSP_REPORTING_COVID}_prelim": NUM_HOSP_REPORTING_COVID_COL,
4351
f"{NUM_HOSP_REPORTING_FLU}_prelim": NUM_HOSP_REPORTING_FLU_COL,
52+
f"{NUM_HOSP_REPORTING_RSV}_prelim": NUM_HOSP_REPORTING_RSV_COL,
4453
}
4554

4655
PRELIM_TYPE_DICT = {
4756
"timestamp": "datetime64[ns]",
4857
"geo_id": str,
4958
f"{TOTAL_ADMISSION_COVID}_prelim": float,
5059
f"{TOTAL_ADMISSION_FLU}_prelim": float,
60+
f"{TOTAL_ADMISSION_RSV}_prelim": float,
5161
f"{NUM_HOSP_REPORTING_COVID}_prelim": float,
5262
f"{NUM_HOSP_REPORTING_FLU}_prelim": float,
63+
f"{NUM_HOSP_REPORTING_RSV}_prelim": float,
5364
}

nhsn/delphi_nhsn/pull.py

Lines changed: 21 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,10 @@ def check_last_updated(socrata_token, dataset_id, logger):
5858
def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):
5959
"""Pull data from Socrata API."""
6060
client = Socrata("data.cdc.gov", socrata_token)
61-
logger.info("Pulling data from Socrata API")
61+
logger.info(
62+
f"Pulling {'main' if dataset_id == MAIN_DATASET_ID else 'preliminary'} data from Socrata API",
63+
dataset_id=dataset_id,
64+
)
6265
results = []
6366
offset = 0
6467
limit = 50000 # maximum limit allowed by SODA 2.0
@@ -80,7 +83,8 @@ def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):
8083

8184
if results:
8285
df = pd.DataFrame.from_records(results)
83-
create_backup_csv(df, backup_dir, False, logger=logger)
86+
sensor = "prelim" if dataset_id == PRELIM_DATASET_ID else None
87+
create_backup_csv(df, backup_dir, False, sensor=sensor, logger=logger)
8488
else:
8589
df = pd.DataFrame()
8690
return df
@@ -120,6 +124,7 @@ def pull_nhsn_data(
120124
backup_dir: str,
121125
custom_run: bool,
122126
issue_date: Optional[str],
127+
preliminary: bool = False,
123128
logger: Optional[logging.Logger] = None,
124129
):
125130
"""Pull the latest NHSN hospital admission data, and conforms it into a dataset.
@@ -137,6 +142,10 @@ def pull_nhsn_data(
137142
Directory to which to save raw backup data
138143
custom_run: bool
139144
Flag indicating if the current run is a patch. If so, don't save any data to disk
145+
preliminary: bool
146+
Flag indicating if the grabbing main or preliminary data
147+
issue_date:
148+
date to indicate which backup file to pull for patching
140149
logger: Optional[logging.Logger]
141150
logger object
142151
@@ -145,22 +154,26 @@ def pull_nhsn_data(
145154
pd.DataFrame
146155
Dataframe as described above.
147156
"""
157+
dataset_id = PRELIM_DATASET_ID if preliminary else MAIN_DATASET_ID
148158
# Pull data from Socrata API
149159
df = (
150-
pull_data(socrata_token, MAIN_DATASET_ID, backup_dir, logger)
160+
pull_data(socrata_token, dataset_id, backup_dir, logger)
151161
if not custom_run
152-
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False)
162+
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=preliminary)
153163
)
154164

155-
recently_updated = True if custom_run else check_last_updated(socrata_token, MAIN_DATASET_ID, logger)
165+
recently_updated = True if custom_run else check_last_updated(socrata_token, dataset_id, logger)
166+
167+
type_dict = PRELIM_TYPE_DICT if preliminary else TYPE_DICT
168+
keep_columns = list(type_dict.keys())
169+
filtered_type_dict = copy.deepcopy(type_dict)
156170

157-
keep_columns = list(TYPE_DICT.keys())
171+
signal_map = PRELIM_SIGNALS_MAP if preliminary else SIGNALS_MAP
158172

159173
if not df.empty and recently_updated:
160174
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
161-
filtered_type_dict = copy.deepcopy(TYPE_DICT)
162175

163-
for signal, col_name in SIGNALS_MAP.items():
176+
for signal, col_name in signal_map.items():
164177
# older backups don't have certain columns
165178
try:
166179
df[signal] = df[col_name]
@@ -178,66 +191,3 @@ def pull_nhsn_data(
178191
df = pd.DataFrame(columns=keep_columns)
179192

180193
return df
181-
182-
183-
def pull_preliminary_nhsn_data(
184-
socrata_token: str,
185-
backup_dir: str,
186-
custom_run: bool,
187-
issue_date: Optional[str],
188-
logger: Optional[logging.Logger] = None,
189-
):
190-
"""Pull the latest preliminary NHSN hospital admission data, and conforms it into a dataset.
191-
192-
The output dataset has:
193-
194-
- Each row corresponds to a single observation
195-
- Each row additionally has columns for the signals in SIGNALS
196-
197-
Parameters
198-
----------
199-
socrata_token: str
200-
My App Token for pulling the NHSN data
201-
backup_dir: str
202-
Directory to which to save raw backup data
203-
custom_run: bool
204-
Flag indicating if the current run is a patch. If so, don't save any data to disk
205-
logger: Optional[logging.Logger]
206-
logger object
207-
208-
Returns
209-
-------
210-
pd.DataFrame
211-
Dataframe as described above.
212-
"""
213-
# Pull data from Socrata API
214-
df = (
215-
pull_data(socrata_token, PRELIM_DATASET_ID, backup_dir, logger)
216-
if not custom_run
217-
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True)
218-
)
219-
220-
keep_columns = list(PRELIM_TYPE_DICT.keys())
221-
recently_updated = True if custom_run else check_last_updated(socrata_token, PRELIM_DATASET_ID, logger)
222-
223-
if not df.empty and recently_updated:
224-
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
225-
filtered_type_dict = copy.deepcopy(PRELIM_TYPE_DICT)
226-
227-
for signal, col_name in PRELIM_SIGNALS_MAP.items():
228-
try:
229-
df[signal] = df[col_name]
230-
except KeyError:
231-
logger.info("column not available in data", col_name=col_name, signal=signal)
232-
keep_columns.remove(signal)
233-
del filtered_type_dict[signal]
234-
235-
df = df[keep_columns]
236-
df = df.astype(filtered_type_dict)
237-
238-
df["geo_id"] = df["geo_id"].str.lower()
239-
df.loc[df["geo_id"] == "usa", "geo_id"] = "us"
240-
else:
241-
df = pd.DataFrame(columns=keep_columns)
242-
243-
return df

nhsn/delphi_nhsn/run.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from delphi_utils.export import create_export_csv
2525

2626
from .constants import GEOS, PRELIM_SIGNALS_MAP, SIGNALS_MAP
27-
from .pull import pull_nhsn_data, pull_preliminary_nhsn_data
27+
from .pull import pull_nhsn_data
2828

2929

3030
def run_module(params, logger=None):
@@ -56,8 +56,8 @@ def run_module(params, logger=None):
5656
export_start_date = export_start_date.strftime("%Y-%m-%d")
5757

5858
nhsn_df = pull_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger)
59-
preliminary_nhsn_df = pull_preliminary_nhsn_data(
60-
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger
59+
preliminary_nhsn_df = pull_nhsn_data(
60+
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger, preliminary=True
6161
)
6262

6363
geo_mapper = GeoMapper()
@@ -92,6 +92,7 @@ def run_module(params, logger=None):
9292

9393
df["se"] = np.nan
9494
df["sample_size"] = np.nan
95+
9596
dates = create_export_csv(
9697
df,
9798
geo_res=geo,

nhsn/tests/test_data/20241212.csv.gz

616 Bytes
Binary file not shown.
663 Bytes
Binary file not shown.

nhsn/tests/test_data/expected_df.csv

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
timestamp,geo_id,confirmed_admissions_covid_ew,confirmed_admissions_flu_ew,confirmed_admissions_rsv_ew,hosprep_confirmed_admissions_covid_ew,hosprep_confirmed_admissions_flu_ew,hosprep_confirmed_admissions_rsv_ew
2+
2021-08-21,md,53.0,2.0,0.0,13.0,13.0,1.0
3+
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
4+
2021-08-21,us,10384.0,6049.0,84.0,5426.0,5426.0,469.0
5+
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
6+
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
7+
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
8+
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
9+
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
10+
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
11+
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
12+
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
13+
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
14+
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
15+
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
16+
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
17+
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
18+
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
19+
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
20+
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0
21+
2021-10-16,region 1,45978.0,266.0,,5394.0,4307.0,0.0
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
timestamp,geo_id,confirmed_admissions_covid_ew_prelim,confirmed_admissions_flu_ew_prelim,confirmed_admissions_rsv_ew_prelim,hosprep_confirmed_admissions_covid_ew_prelim,hosprep_confirmed_admissions_flu_ew_prelim,hosprep_confirmed_admissions_rsv_ew_prelim
2+
2021-08-21,mi,269.0,523.0,1.0,152.0,152.0,4.0
3+
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
4+
2021-08-21,us,8946.0,5576.0,61.0,5422.0,5422.0,485.0
5+
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
6+
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
7+
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
8+
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
9+
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
10+
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
11+
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
12+
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
13+
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
14+
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
15+
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
16+
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
17+
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
18+
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
19+
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
20+
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0

0 commit comments

Comments
 (0)