From 44cd861107cc4a576fa76cb2d8fd8ecfb36e058b Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 19 Mar 2021 17:40:53 -0700 Subject: [PATCH] Nans combo: * add missing columns and tests --- .../delphi_combo_cases_and_deaths/run.py | 30 +++++++++++++++++-- combo_cases_and_deaths/tests/test_run.py | 18 +++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py index d2d1229a9..c1149169d 100755 --- a/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py +++ b/combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py @@ -15,7 +15,7 @@ import covidcast import pandas as pd -from delphi_utils import add_prefix, get_structured_logger +from delphi_utils import add_prefix, get_structured_logger, Nans from delphi_utils.geomap import GeoMapper from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS @@ -292,6 +292,25 @@ def configure_range(params, range_param, yesterday, next_day): date1 = params['indicator']['export_start_date'] params['indicator'][range_param] = [date1, date2] +def add_nancodes(df): + """Add nancodes to the dataframe. + + se and sample_size should already be nan and NOT_APPLICABLE, inheriting from USAFacts + and JHU. Due to the geo aggregation, the missingness codes will get mixed up among rows. + So for the time being, we use only one missing code (OTHER) for nan values in the val + column. + """ + # Default missingness codes + df["missing_val"] = Nans.NOT_MISSING + df["missing_se"] = Nans.NOT_APPLICABLE + df["missing_sample_size"] = Nans.NOT_APPLICABLE + + # Missing codes for `val` + missing_mask = df["val"].isnull() + df.loc[missing_mask, "missing_val"] = Nans.OTHER + + return df + def run_module(params): """ Produce a combined cases and deaths signal using data from JHU and USA Facts. 
@@ -332,7 +351,7 @@ def run_module(params): extend_raw_date_range(params, sensor_name), logger, params['indicator']['issue_range']) - df["timestamp"] = pd.to_datetime(df["timestamp"]) + df = add_nancodes(df) start_date = pd.to_datetime(params['indicator']['export_start_date']) export_dir = params["common"]["export_dir"] dates = pd.Series( @@ -344,7 +363,12 @@ def run_module(params): prefix="wip_") for date_ in dates: export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv' - df[df["timestamp"] == date_][["geo_id", "val", "se", "sample_size", ]].to_csv( + date_mask = (df["timestamp"] == date_) + columns_to_write = [ + "geo_id", "val", "se", "sample_size", + "missing_val", "missing_se", "missing_sample_size" + ] + df.loc[date_mask, columns_to_write].to_csv( f"{export_dir}/{export_fn}", index=False, na_rep="NA" ) diff --git a/combo_cases_and_deaths/tests/test_run.py b/combo_cases_and_deaths/tests/test_run.py index c799b6ed3..d2dd1afd3 100644 --- a/combo_cases_and_deaths/tests/test_run.py +++ b/combo_cases_and_deaths/tests/test_run.py @@ -11,6 +11,8 @@ from delphi_combo_cases_and_deaths.run import ( run_module, extend_raw_date_range, + add_nancodes, + extend_raw_date_range, get_updated_dates, sensor_signal, combine_usafacts_and_jhu, @@ -18,6 +20,7 @@ COLUMN_MAPPING) from delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS from delphi_utils.geomap import GeoMapper +from delphi_utils import Nans TEST_LOGGER = logging.getLogger() @@ -301,5 +304,20 @@ def test_output_files(mock_combine): expected_files += [date + "_" + geo + "_" + metric + ".csv"] assert set(csv_files) == set(expected_files) +def test_add_nancodes(): + df = pd.DataFrame({"geo_id": ["01000", "01001", "01001"], + "val": [50, 100, None], + "timestamp": [20200101, 20200101, 20200101]}) + expected_df = pd.DataFrame({"geo_id": ["01000", "01001", "01001"], + "val": [50, 100, None], + "timestamp": [20200101, 20200101, 20200101], + "missing_val": [Nans.NOT_MISSING, 
Nans.NOT_MISSING, Nans.OTHER], + "missing_se": [Nans.NOT_APPLICABLE] * 3, + "missing_sample_size": [Nans.NOT_APPLICABLE] * 3 + }) + df = add_nancodes(df) + pd.testing.assert_frame_equal(df, expected_df) + + if __name__ == '__main__': unittest.main()