Skip to content

Commit 8113ae7

Browse files
committed
Nans combo:
* add missing columns and tests
1 parent 9b75e07 commit 8113ae7

File tree

2 files changed

+45
-4
lines changed

2 files changed

+45
-4
lines changed

combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import covidcast
1616
import pandas as pd
1717

18-
from delphi_utils import add_prefix, get_structured_logger
18+
from delphi_utils import add_prefix, get_structured_logger, Nans
1919
from delphi_utils.geomap import GeoMapper
2020
from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS
2121

@@ -299,6 +299,25 @@ def configure_range(params, range_param, yesterday, next_day):
299299
date1 = params['indicator']['export_start_date']
300300
params['indicator'][range_param] = [date1, date2]
301301

302+
def add_nancodes(df):
    """Attach missingness-code columns to the dataframe.

    Three columns are written onto `df` itself (the frame is modified in
    place and also returned): `missing_val`, `missing_se` and
    `missing_sample_size`.

    `se` and `sample_size` are expected to already be nan upstream (inherited
    from USAFacts and JHU), so both are always tagged NOT_APPLICABLE. Geo
    aggregation mixes the source missingness codes across rows, so for the
    time being a single code (UNKNOWN) is used for every nan in the `val`
    column; all other rows are tagged NOT_MISSING.
    """
    # Column -> default code; insertion order fixes the resulting column order.
    default_codes = {
        "missing_val": Nans.NOT_MISSING,
        "missing_se": Nans.NOT_APPLICABLE,
        "missing_sample_size": Nans.NOT_APPLICABLE,
    }
    for column_name, code in default_codes.items():
        df[column_name] = code

    # Override the default only where `val` itself is absent.
    val_is_nan = df["val"].isna()
    df.loc[val_is_nan, "missing_val"] = Nans.UNKNOWN

    return df
320+
302321
def run_module(params):
303322
"""
304323
Produce a combined cases and deaths signal using data from JHU and USA Facts.
@@ -332,7 +351,7 @@ def run_module(params):
332351
geo_res,
333352
extend_raw_date_range(params, sensor_name),
334353
params['indicator']['issue_range'])
335-
df["timestamp"] = pd.to_datetime(df["timestamp"])
354+
df = add_nancodes(df)
336355
start_date = pd.to_datetime(params['indicator']['export_start_date'])
337356
export_dir = params["common"]["export_dir"]
338357
dates = pd.Series(
@@ -344,7 +363,12 @@ def run_module(params):
344363
prefix="wip_")
345364
for date_ in dates:
346365
export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
347-
df[df["timestamp"] == date_][["geo_id", "val", "se", "sample_size", ]].to_csv(
366+
date_mask = (df["timestamp"] == date_)
367+
columns_to_write = [
368+
"geo_id", "val", "se", "sample_size",
369+
"missing_val", "missing_se", "missing_sample_size"
370+
]
371+
df.loc[date_mask, columns_to_write].to_csv(
348372
f"{export_dir}/{export_fn}", index=False, na_rep="NA"
349373
)
350374

combo_cases_and_deaths/tests/test_run.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
import numpy as np
88

99
from delphi_combo_cases_and_deaths.run import (
10-
extend_raw_date_range, get_updated_dates,
10+
add_nancodes, extend_raw_date_range,
11+
get_updated_dates,
1112
sensor_signal,
1213
combine_usafacts_and_jhu,
1314
compute_special_geo_dfs,
1415
COLUMN_MAPPING)
1516
from delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS
17+
from delphi_utils import Nans
1618

1719

1820
def test_issue_dates():
@@ -245,5 +247,20 @@ def test_no_nation_jhu(mock_covidcast_signal):
245247
)
246248

247249

250+
def test_add_nancodes():
    """Check the missingness columns that add_nancodes appends to a frame.

    A row with a null `val` must be tagged UNKNOWN in `missing_val`; the other
    rows are NOT_MISSING, and `se`/`sample_size` codes are NOT_APPLICABLE.
    """
    input_columns = {
        "geo_id": ["01000", "01001", "01001"],
        "val": [50, 100, None],
        "timestamp": [20200101, 20200101, 20200101],
    }
    # Expected frame: the input columns followed by the three code columns.
    expected_df = pd.DataFrame({
        **input_columns,
        "missing_val": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.UNKNOWN],
        "missing_se": [Nans.NOT_APPLICABLE] * 3,
        "missing_sample_size": [Nans.NOT_APPLICABLE] * 3,
    })

    result_df = add_nancodes(pd.DataFrame(input_columns))

    pd.testing.assert_frame_equal(result_df, expected_df)
263+
264+
248265
if __name__ == '__main__':
249266
unittest.main()

0 commit comments

Comments
 (0)