diff --git a/hhs_facilities/delphi_hhs_facilities/constants.py b/hhs_facilities/delphi_hhs_facilities/constants.py index 7880a6608..25b7ed951 100644 --- a/hhs_facilities/delphi_hhs_facilities/constants.py +++ b/hhs_facilities/delphi_hhs_facilities/constants.py @@ -1,7 +1,12 @@ """Registry for signals and geographies to process.""" +from numpy import nan from .generate_signals import sum_cols -NAN_VALUES = [None, -999999, -999999.0] +NAN_VALUES = { + None: nan, + -999999: 1.5, # -999,999 represents the data range [0-3], so we use the range mean + -999999.0: 1.5 +} CONFIRMED_ADMISSIONS = "confirmed_admissions_7d" CONFIRMED_SUSPECTED_ADMISSIONS = "sum_confirmed_suspected_admissions_7d" diff --git a/hhs_facilities/delphi_hhs_facilities/generate_signals.py b/hhs_facilities/delphi_hhs_facilities/generate_signals.py index 3460cce46..65ee91692 100644 --- a/hhs_facilities/delphi_hhs_facilities/generate_signals.py +++ b/hhs_facilities/delphi_hhs_facilities/generate_signals.py @@ -5,6 +5,20 @@ import pandas as pd import numpy as np +from delphi_utils import Nans + + +def add_nancodes(df): + """Add nancodes to a signal dataframe.""" + # Default missingness codes + df["missing_val"] = Nans.NOT_MISSING + df["missing_se"] = Nans.NOT_APPLICABLE + df["missing_sample_size"] = Nans.NOT_APPLICABLE + + # Mark any remaining nans with unknown + remaining_nans_mask = df["val"].isnull() + df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER + return df def generate_signal(df: pd.DataFrame, input_cols: list, @@ -34,10 +48,13 @@ def generate_signal(df: pd.DataFrame, df_cols = [df[i] for i in input_cols] df["val"] = signal_func(df_cols) df["timestamp"] = df["timestamp"] + pd.Timedelta(days=date_offset) - df.dropna(subset=["val"], inplace=True) - df = df.groupby(["timestamp", "geo_id"], as_index=False).sum() + df = df.groupby(["timestamp", "geo_id"], as_index=False).sum(min_count=1) df["se"] = df["sample_size"] = np.nan - return df[["timestamp", "geo_id", "val", "se", "sample_size"]] + df = add_nancodes(df) + export_columns = [ + "timestamp", "geo_id", "val", "se", "sample_size", + "missing_val", "missing_se", "missing_sample_size"] + return df[export_columns] def sum_cols(cols: list) -> pd.Series: diff --git a/hhs_facilities/delphi_hhs_facilities/pull.py b/hhs_facilities/delphi_hhs_facilities/pull.py index 0e3851c0d..ff4303b6c 100644 --- a/hhs_facilities/delphi_hhs_facilities/pull.py +++ b/hhs_facilities/delphi_hhs_facilities/pull.py @@ -3,7 +3,6 @@ from datetime import date import pandas as pd -import numpy as np from delphi_utils.geomap import GeoMapper from delphi_epidata import Epidata @@ -54,6 +53,6 @@ def pull_data() -> pd.DataFrame: past_reference_day = int(date(2020, 1, 1).strftime("%Y%m%d")) # first available date in DB all_states = GeoMapper().get_geo_values("state_id") responses = pull_data_iteratively(all_states, Epidata.range(past_reference_day, today)) - all_columns = pd.DataFrame(responses).replace(NAN_VALUES, np.nan) + all_columns = pd.DataFrame(responses).replace(NAN_VALUES) all_columns["timestamp"] = pd.to_datetime(all_columns["collection_week"], format="%Y%m%d") return all_columns diff --git a/hhs_facilities/tests/expected/20200131_county_confirmed_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_county_confirmed_admissions_7d.csv index 6655255d5..0745fb95b 100644 --- a/hhs_facilities/tests/expected/20200131_county_confirmed_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_county_confirmed_admissions_7d.csv @@ -1,4 +1,4 @@ -geo_id,val,se,sample_size -25013,33.0,NA,NA -72001,76.56462035541196,NA,NA -72141,0.4353796445880453,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +25013,33.00000000,NA,NA,0,1,1 +72001,76.56462040,NA,NA,0,1,1 +72141,0.43537960,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_county_sum_confirmed_suspected_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_county_sum_confirmed_suspected_admissions_7d.csv index 8fa4aa846..b3fb890f6 100644 --- a/hhs_facilities/tests/expected/20200131_county_sum_confirmed_suspected_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_county_sum_confirmed_suspected_admissions_7d.csv @@ -1,4 +1,4 @@ -geo_id,val,se,sample_size -25013,98.0,NA,NA -72001,161.08400646203557,NA,NA -72141,0.9159935379644588,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +25013,98.00000000,NA,NA,0,1,1 +72001,161.08400650,NA,NA,0,1,1 +72141,0.91599350,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_hrr_confirmed_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_hrr_confirmed_admissions_7d.csv index 93528fbba..9bcb23dfd 100644 --- a/hhs_facilities/tests/expected/20200131_hrr_confirmed_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_hrr_confirmed_admissions_7d.csv @@ -1,2 +1,2 @@ -geo_id,val,se,sample_size -230,33.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +230,33.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_hrr_sum_confirmed_suspected_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_hrr_sum_confirmed_suspected_admissions_7d.csv index 1ae99e5c7..954b8d9eb 100644 --- a/hhs_facilities/tests/expected/20200131_hrr_sum_confirmed_suspected_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_hrr_sum_confirmed_suspected_admissions_7d.csv @@ -1,2 +1,2 @@ -geo_id,val,se,sample_size -230,98.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +230,98.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_msa_confirmed_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_msa_confirmed_admissions_7d.csv index efbe376d8..ff49eb1bf 100644 --- a/hhs_facilities/tests/expected/20200131_msa_confirmed_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_msa_confirmed_admissions_7d.csv @@ -1,4 +1,4 @@ -geo_id,val,se,sample_size -10380,0.4353796445880453,NA,NA -38660,76.56462035541196,NA,NA -44140,33.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +10380,0.43537960,NA,NA,0,1,1 +38660,76.56462040,NA,NA,0,1,1 +44140,33.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_msa_sum_confirmed_suspected_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_msa_sum_confirmed_suspected_admissions_7d.csv index f71d79a72..5825ae148 100644 --- a/hhs_facilities/tests/expected/20200131_msa_sum_confirmed_suspected_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_msa_sum_confirmed_suspected_admissions_7d.csv @@ -1,4 +1,4 @@ -geo_id,val,se,sample_size -10380,0.9159935379644588,NA,NA -38660,161.08400646203557,NA,NA -44140,98.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +10380,0.91599350,NA,NA,0,1,1 +38660,161.08400650,NA,NA,0,1,1 +44140,98.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_state_confirmed_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_state_confirmed_admissions_7d.csv index cd00b7db6..bcbf93e6a 100644 --- a/hhs_facilities/tests/expected/20200131_state_confirmed_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_state_confirmed_admissions_7d.csv @@ -1,3 +1,3 @@ -geo_id,val,se,sample_size -AL,33.0,NA,NA -PR,33.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +AL,33.00000000,NA,NA,0,1,1 +PR,33.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/expected/20200131_state_sum_confirmed_suspected_admissions_7d.csv b/hhs_facilities/tests/expected/20200131_state_sum_confirmed_suspected_admissions_7d.csv index c4513d920..601f55748 100644 --- a/hhs_facilities/tests/expected/20200131_state_sum_confirmed_suspected_admissions_7d.csv +++ b/hhs_facilities/tests/expected/20200131_state_sum_confirmed_suspected_admissions_7d.csv @@ -1,3 +1,3 @@ -geo_id,val,se,sample_size -AL,98.0,NA,NA -PR,48.0,NA,NA +geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size +AL,98.00000000,NA,NA,0,1,1 +PR,48.00000000,NA,NA,0,1,1 diff --git a/hhs_facilities/tests/test_generate_signals.py b/hhs_facilities/tests/test_generate_signals.py index 5d5b84d1a..ef2ec42d1 100644 --- a/hhs_facilities/tests/test_generate_signals.py +++ b/hhs_facilities/tests/test_generate_signals.py @@ -4,7 +4,7 @@ import numpy as np from delphi_hhs_facilities.generate_signals import generate_signal, sum_cols - +from delphi_utils import Nans class TestGenerateSignals: @@ -23,11 +23,15 @@ def test_generate_signals(self): expected = pd.DataFrame( {"timestamp": [pd.Timestamp("20200131"), pd.Timestamp("20200201"), - pd.Timestamp("20200202")], - "geo_id": ["x", "x", "y"], - "val": [5., 7., 10.], - "se": [np.nan]*3, - "sample_size": [np.nan]*3 + pd.Timestamp("20200202"), + pd.Timestamp("20200203")], + "geo_id": ["x", "x", "y", "z"], + "val": [5., 7., 10., np.nan], + "se": [np.nan]*4, + "sample_size": [np.nan]*4, + "missing_val": [Nans.NOT_MISSING] * 3 + [Nans.OTHER], + "missing_se": [Nans.NOT_APPLICABLE] * 4, + "missing_sample_size": [Nans.NOT_APPLICABLE] * 4, }) pd.testing.assert_frame_equal(test_output, expected) diff --git a/hhs_facilities/tests/test_pull.py b/hhs_facilities/tests/test_pull.py index 4314809a2..e951737a8 100644 --- a/hhs_facilities/tests/test_pull.py +++ b/hhs_facilities/tests/test_pull.py @@ -7,7 +7,7 @@ import numpy as np from delphi_hhs_facilities.pull import pull_data_iteratively, pull_data - +from delphi_hhs_facilities.constants import NAN_VALUES class TestPull: @@ -64,8 +64,8 @@ def test_pull_data(self, pull_data_iteratively): output, pd.DataFrame({"collection_week": [20201204.], "total_beds_7_day_sum": [2360.0], - "all_adult_hospital_beds_7_day_sum": [np.nan], - "inpatient_beds_7_day_avg": [np.nan], + "all_adult_hospital_beds_7_day_sum": [NAN_VALUES[-999999]], + "inpatient_beds_7_day_avg": [NAN_VALUES[-999999]], "total_icu_beds_7_day_avg": [np.nan], "total_staffed_adult_icu_beds_7_day_avg": [32.4], "timestamp": [pd.Timestamp("2020-12-04")]}),