@@ -22,6 +22,19 @@ class EMRHospSensor:
22
22
"""Sensor class to fit a signal using CLI counts from EMR Hospitalization data.
23
23
"""
24
24
25
@staticmethod
def gauss_smooth(count, total):
    """Smooth a numerator/denominator pair with left_gauss_linear and clip the results.

    The smoother is not guaranteed to return non-negative values, and after
    smoothing the numerator may exceed the denominator, so both outputs are
    clipped before being returned.

    Args:
        count: array of numerator counts
        total: array of denominator totals

    Returns:
        tuple (count, total) of smoothed arrays, where total is clipped below
        at 0 and count is clipped elementwise to lie between 0 and the
        clipped total.
    """
    raw_num = left_gauss_linear(count)
    raw_den = left_gauss_linear(total)
    # the denominator is bounded first because it serves as the
    # elementwise upper bound for the numerator
    bounded_den = np.clip(raw_den, 0, None)
    bounded_num = np.clip(raw_num, 0, bounded_den)
    return bounded_num, bounded_den
37
+
25
38
@staticmethod
26
39
def backfill (
27
40
num ,
@@ -37,15 +50,19 @@ def backfill(
37
50
bin size so as to avoid including long-past values.
38
51
39
52
Args:
40
- num: dataframe of covid counts
41
- den: dataframe of total visits
53
+ num: array of covid counts
54
+ den: array of total visits
42
55
k: maximum number of days used to average a backfill correction
43
56
min_visits_to_fill: minimum number of total visits needed in order to sum a bin
44
57
45
58
Returns: arrays of adjusted covid counts and adjusted visit counts
46
59
"""
47
- revden = den [::- 1 ].values
48
- revnum = num [::- 1 ].values .reshape (- 1 , 1 )
60
+ if isinstance (den ,(pd .DataFrame ,pd .Series )):
61
+ den = den .values
62
+ if isinstance (num ,(pd .DataFrame ,pd .Series )):
63
+ num = num .values
64
+ revden = den [::- 1 ]
65
+ revnum = num [::- 1 ].reshape (- 1 , 1 )
49
66
new_num = np .full_like (revnum , np .nan , dtype = float )
50
67
new_den = np .full_like (revden , np .nan , dtype = float )
51
68
n , p = revnum .shape
@@ -76,40 +93,37 @@ def backfill(
76
93
new_num = new_num [::- 1 ]
77
94
new_den = new_den [::- 1 ]
78
95
79
- # reset date index and format
80
- new_num = pd .Series (new_num .flatten (), name = num .name , index = num .index )
81
- new_den = pd .Series (new_den , index = den .index )
82
-
83
96
return new_num , new_den
84
97
85
98
@staticmethod
86
- def fit (y_data , sensor_dates , geo_id ):
99
+ def fit (y_data , first_sensor_date , geo_id , num_col = "num" , den_col = "den" ):
87
100
"""Fitting routine.
88
101
89
102
Args:
90
103
y_data: dataframe for one geo_id, indexed by date
91
- sensor_dates: list of sorted datetime for which to produce sensor values
104
+ first_sensor_date: datetime of the first date for which to produce sensor values
92
105
geo_id: unique identifier for the location column
106
+ num_col: str name of numerator column
107
+ den_col: str name of denominator column
93
108
94
109
Returns:
95
110
dictionary of results
96
111
97
112
"""
98
- # values to keep
99
- fitting_idxs = np .where (y_data .index >= sensor_dates [0 ])[0 ]
100
-
101
113
# backfill
102
- total_counts , total_visits = EMRHospSensor .backfill (y_data ["num" ], y_data ["den" ])
114
+ total_counts , total_visits = EMRHospSensor .backfill (y_data [num_col ].values , y_data [den_col ].values )
115
+ # total_counts = pd.Series(total_counts.flatten(), name=num_col, index=y_data.index)
116
+ # total_visits = pd.Series(total_visits, index=y_data.index)
103
117
104
118
# calculate smoothed counts and jeffreys rate
105
119
# the left_gauss_linear smoother is not guaranteed to return values greater than 0
106
- smoothed_total_counts = np . clip ( left_gauss_linear ( total_counts . values ), 0 , None )
107
- smoothed_total_visits = np . clip ( left_gauss_linear ( total_visits . values ), 0 , None )
120
+
121
+ smoothed_total_counts , smoothed_total_visits = EMRHospSensor . gauss_smooth ( total_counts . flatten (), total_visits )
108
122
109
123
# in smoothing, the numerator may have become more than the denominator
110
124
# simple fix is to clip the max values elementwise to the denominator (note that
111
125
# this has only been observed in synthetic data)
112
- smoothed_total_counts = np .clip (smoothed_total_counts , 0 , smoothed_total_visits )
126
+ # smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
113
127
114
128
smoothed_total_rates = (
115
129
(smoothed_total_counts + 0.5 ) / (smoothed_total_visits + 1 )
@@ -124,14 +138,12 @@ def fit(y_data, sensor_dates, geo_id):
124
138
), f"0 or negative value, { geo_id } "
125
139
126
140
# cut off at sensor indexes
127
- rates = smoothed_total_rates [fitting_idxs ]
128
- den = smoothed_total_visits [fitting_idxs ]
129
- include = den >= Config .MIN_DEN
130
-
131
- # calculate standard error
132
- se = np .full_like (rates , np .nan )
133
- se [include ] = np .sqrt (
134
- np .divide ((rates [include ] * (1 - rates [include ])), den [include ]))
135
-
136
- logging .debug (f"{ geo_id } : { rates [- 1 ]:.3f} ,[{ se [- 1 ]:.3f} ]" )
137
- return {"geo_id" : geo_id , "rate" : 100 * rates , "se" : 100 * se , "incl" : include }
141
+ rate_data = pd .DataFrame ({'rate' :smoothed_total_rates , 'den' : smoothed_total_visits }, index = y_data .index )
142
+ rate_data = rate_data [first_sensor_date :]
143
+ include = rate_data ['den' ] >= Config .MIN_DEN
144
+ valid_rates = rate_data [include ]
145
+ se_valid = valid_rates .eval ('sqrt(rate * (1 - rate) / den)' )
146
+ rate_data ['se' ] = se_valid
147
+
148
+ logging .debug (f"{ geo_id } : { rate_data ['rate' ][- 1 ]:.3f} ,[{ rate_data ['se' ][- 1 ]:.3f} ]" )
149
+ return {"geo_id" : geo_id , "rate" : 100 * rate_data ['rate' ], "se" : 100 * rate_data ['se' ], "incl" : include }
0 commit comments