Skip to content

Commit 2263105

Browse files
committed
EMRHospSensor: added gauss_smooth, backfill uses numpy only, fit uses timeindex
1 parent 1bf25f1 commit 2263105

File tree

2 files changed

+59
-42
lines changed

2 files changed

+59
-42
lines changed

emr_hosp/delphi_emr_hosp/sensor.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ class EMRHospSensor:
2222
"""Sensor class to fit a signal using CLI counts from EMR Hospitalization data.
2323
"""
2424

25+
@staticmethod
26+
def gauss_smooth(count,total):
27+
"""Smooth count and total with left_gauss_linear, clipping total to >= 0 and count to [0, total].
28+
29+
Args:
30+
count, total: array
31+
"""
32+
count_smooth = left_gauss_linear(count)
33+
total_smooth = left_gauss_linear(total)
34+
total_clip = np.clip(total_smooth, 0, None)
35+
count_clip = np.clip(count_smooth, 0, total_clip)
36+
return count_clip, total_clip
37+
2538
@staticmethod
2639
def backfill(
2740
num,
@@ -37,15 +50,19 @@ def backfill(
3750
bin size so as to avoid including long-past values.
3851
3952
Args:
40-
num: dataframe of covid counts
41-
den: dataframe of total visits
53+
num: array of covid counts
54+
den: array of total visits
4255
k: maximum number of days used to average a backfill correction
4356
min_visits_to_fill: minimum number of total visits needed in order to sum a bin
4457
4558
Returns: arrays of adjusted covid counts and adjusted visit counts
4659
"""
47-
revden = den[::-1].values
48-
revnum = num[::-1].values.reshape(-1, 1)
60+
if isinstance(den,(pd.DataFrame,pd.Series)):
61+
den = den.values
62+
if isinstance(num,(pd.DataFrame,pd.Series)):
63+
num = num.values
64+
revden = den[::-1]
65+
revnum = num[::-1].reshape(-1, 1)
4966
new_num = np.full_like(revnum, np.nan, dtype=float)
5067
new_den = np.full_like(revden, np.nan, dtype=float)
5168
n, p = revnum.shape
@@ -76,20 +93,18 @@ def backfill(
7693
new_num = new_num[::-1]
7794
new_den = new_den[::-1]
7895

79-
# reset date index and format
80-
new_num = pd.Series(new_num.flatten(), name=num.name, index=num.index)
81-
new_den = pd.Series(new_den, index=den.index)
82-
8396
return new_num, new_den
8497

8598
@staticmethod
86-
def fit(y_data, first_sensor_date, geo_id):
99+
def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
87100
"""Fitting routine.
88101
89102
Args:
90103
y_data: dataframe for one geo_id, indexed by date
91104
first_sensor_date: datetime of first date
92105
geo_id: unique identifier for the location column
106+
num_col: str name of numerator column
107+
den_col: str name of denominator column
93108
94109
Returns:
95110
dictionary of results
@@ -99,17 +114,19 @@ def fit(y_data, first_sensor_date, geo_id):
99114
fitting_idxs = np.where(y_data.index >= first_sensor_date)[0] # JS: WILL CHANGE
100115

101116
# backfill
102-
total_counts, total_visits = EMRHospSensor.backfill(y_data["num"], y_data["den"])
117+
total_counts, total_visits = EMRHospSensor.backfill(y_data[num_col].values, y_data[den_col].values)
118+
# total_counts = pd.Series(total_counts.flatten(), name=num_col, index=y_data.index)
119+
# total_visits = pd.Series(total_visits, index=y_data.index)
103120

104121
# calculate smoothed counts and jeffreys rate
105122
# the left_gauss_linear smoother is not guaranteed to return values greater than 0
106-
smoothed_total_counts = np.clip(left_gauss_linear(total_counts.values), 0, None)
107-
smoothed_total_visits = np.clip(left_gauss_linear(total_visits.values), 0, None)
123+
124+
smoothed_total_counts, smoothed_total_visits = EMRHospSensor.gauss_smooth(total_counts.flatten(),total_visits)
108125

109126
# in smoothing, the numerator may have become more than the denominator
110127
# simple fix is to clip the max values elementwise to the denominator (note that
111128
# this has only been observed in synthetic data)
112-
smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
129+
# smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
113130

114131
smoothed_total_rates = (
115132
(smoothed_total_counts + 0.5) / (smoothed_total_visits + 1)
@@ -124,14 +141,12 @@ def fit(y_data, first_sensor_date, geo_id):
124141
), f"0 or negative value, {geo_id}"
125142

126143
# cut off at sensor indexes
127-
rates = smoothed_total_rates[fitting_idxs]
128-
den = smoothed_total_visits[fitting_idxs]
129-
include = den >= Config.MIN_DEN
130-
131-
# calculate standard error
132-
se = np.full_like(rates, np.nan)
133-
se[include] = np.sqrt(
134-
np.divide((rates[include] * (1 - rates[include])), den[include]))
135-
136-
logging.debug(f"{geo_id}: {rates[-1]:.3f},[{se[-1]:.3f}]")
137-
return {"geo_id": geo_id, "rate": 100 * rates, "se": 100 * se, "incl": include}
144+
rate_data = pd.DataFrame({'rate':smoothed_total_rates, 'den': smoothed_total_visits}, index=y_data.index)
145+
rate_data = rate_data[first_sensor_date:]
146+
include = rate_data['den'] >= Config.MIN_DEN
147+
valid_rates = rate_data[include]
148+
se_valid = valid_rates.eval('sqrt(rate * (1 - rate) / den)')
149+
rate_data['se'] = se_valid
150+
151+
logging.debug(f"{geo_id}: {rate_data['rate'][-1]:.3f},[{rate_data['se'][-1]:.3f}]")
152+
return {"geo_id": geo_id, "rate": 100 * rate_data['rate'], "se": 100 * rate_data['se'], "incl": include}

emr_hosp/tests/test_sensor.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,30 +25,30 @@ class TestLoadData:
2525
"hrr")
2626

2727
def test_backfill(self):
28-
num0 = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=float)
29-
den0 = pd.Series([0, 10, 10, 10, 10, 10, 10, 100, 101], dtype=float)
28+
num0 = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=float).reshape(-1, 1)
29+
den0 = np.array([0, 10, 10, 10, 10, 10, 10, 100, 101], dtype=float)
3030

3131
num1, den1 = EMRHospSensor.backfill(num0, den0, k=7, min_visits_to_fill=0)
32-
pd.testing.assert_series_equal(num0, num1)
33-
pd.testing.assert_series_equal(den0, den1)
32+
assert np.array_equal(num0, num1)
33+
assert np.array_equal(den0, den1)
3434

3535
num2, den2 = EMRHospSensor.backfill(num0, den0, k=7, min_visits_to_fill=11)
36-
exp_num2 = pd.Series([0, 1, 3, 5, 7, 9, 11, 7, 8], dtype=float)
37-
exp_den2 = pd.Series([0, 10, 20, 20, 20, 20, 20, 100, 101], dtype=float)
38-
pd.testing.assert_series_equal(exp_num2, num2)
39-
pd.testing.assert_series_equal(exp_den2, den2)
40-
36+
exp_num2 = np.array([0, 1, 3, 5, 7, 9, 11, 7, 8], dtype=float).reshape(-1, 1)
37+
exp_den2 = np.array([0, 10, 20, 20, 20, 20, 20, 100, 101], dtype=float)
38+
assert np.array_equal(exp_num2, num2)
39+
assert np.array_equal(exp_den2, den2)
40+
#
4141
num3, den3 = EMRHospSensor.backfill(num0, den0, k=7, min_visits_to_fill=100)
42-
exp_num3 = pd.Series([0, 1, 3, 6, 10, 15, 21, 7, 8], dtype=float)
43-
exp_den3 = pd.Series([0, 10, 20, 30, 40, 50, 60, 100, 101], dtype=float)
44-
pd.testing.assert_series_equal(exp_num3, num3)
45-
pd.testing.assert_series_equal(exp_den3, den3)
46-
42+
exp_num3 = np.array([0, 1, 3, 6, 10, 15, 21, 7, 8], dtype=float).reshape(-1, 1)
43+
exp_den3 = np.array([0, 10, 20, 30, 40, 50, 60, 100, 101], dtype=float)
44+
assert np.array_equal(exp_num3, num3)
45+
assert np.array_equal(exp_den3, den3)
46+
#
4747
num4, den4 = EMRHospSensor.backfill(num0, den0, k=3, min_visits_to_fill=100)
48-
exp_num4 = pd.Series([0, 1, 3, 6, 10, 14, 18, 7, 8], dtype=float)
49-
exp_den4 = pd.Series([0, 10, 20, 30, 40, 40, 40, 100, 101], dtype=float)
50-
pd.testing.assert_series_equal(exp_num4, num4)
51-
pd.testing.assert_series_equal(exp_den4, den4)
48+
exp_num4 = np.array([0, 1, 3, 6, 10, 14, 18, 7, 8], dtype=float).reshape(-1, 1)
49+
exp_den4 = np.array([0, 10, 20, 30, 40, 40, 40, 100, 101], dtype=float)
50+
assert np.array_equal(exp_num4, num4)
51+
assert np.array_equal(exp_den4, den4)
5252

5353
def test_fit_fips(self):
5454
date_range = pd.date_range("2020-05-01", "2020-05-20")
@@ -93,3 +93,5 @@ def test_fit_hrrs(self):
9393
assert np.nanmax(res0["se"]) <= 100 * (0.5 / np.sqrt(Config.MIN_DEN))
9494
assert np.nanmin(res0["se"]) > 0
9595
assert res0["incl"].sum() > 0
96+
97+

0 commit comments

Comments
 (0)