15
15
import covidcast
16
16
import pandas as pd
17
17
18
- from delphi_utils import add_prefix , get_structured_logger
18
+ from delphi_utils import add_prefix , get_structured_logger , Nans
19
19
from delphi_utils .geomap import GeoMapper
20
20
from .constants import METRICS , SMOOTH_TYPES , SENSORS , GEO_RESOLUTIONS
21
21
@@ -299,6 +299,25 @@ def configure_range(params, range_param, yesterday, next_day):
299
299
date1 = params ['indicator' ]['export_start_date' ]
300
300
params ['indicator' ][range_param ] = [date1 , date2 ]
301
301
302
def add_nancodes(df):
    """Attach missingness-code columns to ``df`` and return it.

    se and sample_size should already be nan with NOT_APPLICABLE codes,
    inherited from USAFacts and JHU. Because geo aggregation mixes the
    original per-row codes together, a single catch-all code (UNKNOWN)
    is used for any nan in the val column for the time being.
    """
    # Start every row off with the default missingness codes.
    default_codes = {
        "missing_val": Nans.NOT_MISSING,
        "missing_se": Nans.NOT_APPLICABLE,
        "missing_sample_size": Nans.NOT_APPLICABLE,
    }
    for column, code in default_codes.items():
        df[column] = code

    # Rows whose val is nan get the catch-all UNKNOWN code instead.
    df.loc[df["val"].isna(), "missing_val"] = Nans.UNKNOWN

    return df

302
321
def run_module (params ):
303
322
"""
304
323
Produce a combined cases and deaths signal using data from JHU and USA Facts.
@@ -332,7 +351,7 @@ def run_module(params):
332
351
geo_res ,
333
352
extend_raw_date_range (params , sensor_name ),
334
353
params ['indicator' ]['issue_range' ])
335
- df [ "timestamp" ] = pd . to_datetime (df [ "timestamp" ] )
354
+ df = add_nancodes (df )
336
355
start_date = pd .to_datetime (params ['indicator' ]['export_start_date' ])
337
356
export_dir = params ["common" ]["export_dir" ]
338
357
dates = pd .Series (
@@ -344,7 +363,12 @@ def run_module(params):
344
363
prefix = "wip_" )
345
364
for date_ in dates :
346
365
export_fn = f'{ date_ .strftime ("%Y%m%d" )} _{ geo_res } _{ signal_name [0 ]} .csv'
347
- df [df ["timestamp" ] == date_ ][["geo_id" , "val" , "se" , "sample_size" , ]].to_csv (
366
+ date_mask = (df ["timestamp" ] == date_ )
367
+ columns_to_write = [
368
+ "geo_id" , "val" , "se" , "sample_size" ,
369
+ "missing_val" , "missing_se" , "missing_sample_size"
370
+ ]
371
+ df .loc [date_mask , columns_to_write ].to_csv (
348
372
f"{ export_dir } /{ export_fn } " , index = False , na_rep = "NA"
349
373
)
350
374
0 commit comments