Skip to content

Commit dd43fb9

Browse files
committed
NANs HHS facilities:
* add missing columns
* change -999999 entries to 1.5 (average of [0-3] data range)
1 parent b390a0f commit dd43fb9

14 files changed

+75
-39
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,16 @@ def diff_export_csv(
112112
deleted_df[["val", "se", "sample_size"]] = np.nan
113113
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
114114

115+
# Code deleted entries as nans with the deleted missing code
116+
deleted_df = before_df.loc[deleted_idx, :].copy()
117+
deleted_df[["val", "se", "sample_size"]] = np.nan
118+
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
119+
120+
# Code deleted entries as nans with the deleted missing code
121+
deleted_df = before_df.loc[deleted_idx, :].copy()
122+
deleted_df[["val", "se", "sample_size"]] = np.nan
123+
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
124+
115125
return (
116126
deleted_df,
117127
after_df_cmn.loc[~(same_mask.all(axis=1)), :],

hhs_facilities/delphi_hhs_facilities/constants.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
"""Registry for signals and geographies to process."""
22
from .generate_signals import sum_cols
3+
from numpy import nan
34

4-
NAN_VALUES = [None, -999999, -999999.0]
5+
NAN_VALUES = {
6+
None: nan,
7+
-999999: 1.5, # -999,999 represents the data range [0-3], so we use the range mean
8+
-999999.0: 1.5
9+
}
510

611
CONFIRMED_ADMISSIONS = "confirmed_admissions_7d"
712
CONFIRMED_SUSPECTED_ADMISSIONS = "sum_confirmed_suspected_admissions_7d"

hhs_facilities/delphi_hhs_facilities/generate_signals.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,20 @@
55
import pandas as pd
66
import numpy as np
77

8+
from delphi_utils import Nans
9+
10+
11+
def add_nancodes(df):
12+
"""Add nancodes to a signal dataframe."""
13+
# Default missingness codes
14+
df["missing_val"] = Nans.NOT_MISSING
15+
df["missing_se"] = Nans.NOT_APPLICABLE
16+
df["missing_sample_size"] = Nans.NOT_APPLICABLE
17+
18+
# Mark any remaining nans with unknown
19+
remaining_nans_mask = df["val"].isnull()
20+
df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
21+
return df
822

923
def generate_signal(df: pd.DataFrame,
1024
input_cols: list,
@@ -34,10 +48,13 @@ def generate_signal(df: pd.DataFrame,
3448
df_cols = [df[i] for i in input_cols]
3549
df["val"] = signal_func(df_cols)
3650
df["timestamp"] = df["timestamp"] + pd.Timedelta(days=date_offset)
37-
df.dropna(subset=["val"], inplace=True)
38-
df = df.groupby(["timestamp", "geo_id"], as_index=False).sum()
51+
df = df.groupby(["timestamp", "geo_id"], as_index=False).sum(min_count=1)
3952
df["se"] = df["sample_size"] = np.nan
40-
return df[["timestamp", "geo_id", "val", "se", "sample_size"]]
53+
df = add_nancodes(df)
54+
export_columns = [
55+
"timestamp", "geo_id", "val", "se", "sample_size",
56+
"missing_val", "missing_se", "missing_sample_size"]
57+
return df[export_columns]
4158

4259

4360
def sum_cols(cols: list) -> pd.Series:

hhs_facilities/delphi_hhs_facilities/pull.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,6 @@ def pull_data() -> pd.DataFrame:
5454
past_reference_day = int(date(2020, 1, 1).strftime("%Y%m%d")) # first available date in DB
5555
all_states = GeoMapper().get_geo_values("state_id")
5656
responses = pull_data_iteratively(all_states, Epidata.range(past_reference_day, today))
57-
all_columns = pd.DataFrame(responses).replace(NAN_VALUES, np.nan)
57+
all_columns = pd.DataFrame(responses).replace(NAN_VALUES)
5858
all_columns["timestamp"] = pd.to_datetime(all_columns["collection_week"], format="%Y%m%d")
5959
return all_columns
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
25013,33.0,NA,NA
3-
72001,76.56462035541196,NA,NA
4-
72141,0.4353796445880453,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
25013,33.00000000,NA,NA,0,1,1
3+
72001,76.56462040,NA,NA,0,1,1
4+
72141,0.43537960,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
25013,98.0,NA,NA
3-
72001,161.08400646203557,NA,NA
4-
72141,0.9159935379644588,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
25013,98.00000000,NA,NA,0,1,1
3+
72001,161.08400650,NA,NA,0,1,1
4+
72141,0.91599350,NA,NA,0,1,1
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
geo_id,val,se,sample_size
2-
230,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
230,33.00000000,NA,NA,0,1,1
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
geo_id,val,se,sample_size
2-
230,98.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
230,98.00000000,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
10380,0.4353796445880453,NA,NA
3-
38660,76.56462035541196,NA,NA
4-
44140,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
10380,0.43537960,NA,NA,0,1,1
3+
38660,76.56462040,NA,NA,0,1,1
4+
44140,33.00000000,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
10380,0.9159935379644588,NA,NA
3-
38660,161.08400646203557,NA,NA
4-
44140,98.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
10380,0.91599350,NA,NA,0,1,1
3+
38660,161.08400650,NA,NA,0,1,1
4+
44140,98.00000000,NA,NA,0,1,1
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
geo_id,val,se,sample_size
2-
AL,33.0,NA,NA
3-
PR,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
AL,33.00000000,NA,NA,0,1,1
3+
PR,33.00000000,NA,NA,0,1,1
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
geo_id,val,se,sample_size
2-
AL,98.0,NA,NA
3-
PR,48.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
AL,98.00000000,NA,NA,0,1,1
3+
PR,48.00000000,NA,NA,0,1,1

hhs_facilities/tests/test_generate_signals.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55

66
from delphi_hhs_facilities.generate_signals import generate_signal, sum_cols
7-
7+
from delphi_utils import Nans
88

99
class TestGenerateSignals:
1010

@@ -23,11 +23,15 @@ def test_generate_signals(self):
2323
expected = pd.DataFrame(
2424
{"timestamp": [pd.Timestamp("20200131"),
2525
pd.Timestamp("20200201"),
26-
pd.Timestamp("20200202")],
27-
"geo_id": ["x", "x", "y"],
28-
"val": [5., 7., 10.],
29-
"se": [np.nan]*3,
30-
"sample_size": [np.nan]*3
26+
pd.Timestamp("20200202"),
27+
pd.Timestamp("20200203")],
28+
"geo_id": ["x", "x", "y", "z"],
29+
"val": [5., 7., 10., np.nan],
30+
"se": [np.nan]*4,
31+
"sample_size": [np.nan]*4,
32+
"missing_val": [Nans.NOT_MISSING] * 3 + [Nans.UNKNOWN],
33+
"missing_se": [Nans.NOT_APPLICABLE] * 4,
34+
"missing_sample_size": [Nans.NOT_APPLICABLE] * 4,
3135
})
3236
pd.testing.assert_frame_equal(test_output, expected)
3337

hhs_facilities/tests/test_pull.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def test_pull_data(self, pull_data_iteratively):
6464
output,
6565
pd.DataFrame({"collection_week": [20201204.],
6666
"total_beds_7_day_sum": [2360.0],
67-
"all_adult_hospital_beds_7_day_sum": [np.nan],
68-
"inpatient_beds_7_day_avg": [np.nan],
67+
"all_adult_hospital_beds_7_day_sum": [2.0],
68+
"inpatient_beds_7_day_avg": [2.0],
6969
"total_icu_beds_7_day_avg": [np.nan],
7070
"total_staffed_adult_icu_beds_7_day_avg": [32.4],
7171
"timestamp": [pd.Timestamp("2020-12-04")]}),

0 commit comments

Comments
 (0)