10
10
from boto3 import Session
11
11
from moto import mock_s3
12
12
import pytest
13
+ import mock
13
14
14
15
# first party
15
16
from delphi_changehc .config import Config
16
- from delphi_changehc .update_sensor import write_to_csv , CHCSensorUpdator
17
+ from delphi_changehc .update_sensor import add_nancodes , censor_columns , write_to_csv , CHCSensorUpdator
18
+ from delphi_utils .nancodes import Nans
17
19
18
20
CONFIG = Config ()
19
21
PARAMS = {
@@ -92,7 +94,8 @@ def test_geo_reindex(self):
92
94
def test_update_sensor (self ):
93
95
"""Tests that the sensors are properly updated."""
94
96
outputs = {}
95
- for geo in ["county" , "state" , "hhs" , "nation" ]:
97
+ geos = ["county" , "state" , "hhs" , "nation" ]
98
+ for geo in geos :
96
99
td = TemporaryDirectory ()
97
100
su_inst = CHCSensorUpdator (
98
101
"03-01-2020" ,
@@ -122,11 +125,10 @@ def test_update_sensor(self):
122
125
assert len (os .listdir (td .name )) == len (su_inst .sensor_dates ),\
123
126
f"failed { geo } update sensor test"
124
127
td .cleanup ()
125
- assert outputs ["20200319_county_smoothed_outpatient_covid.csv" ].empty
126
- assert outputs ["20200319_state_smoothed_outpatient_covid.csv" ].empty
127
- assert outputs ["20200319_hhs_smoothed_outpatient_covid.csv" ].empty
128
- assert outputs ["20200319_nation_smoothed_outpatient_covid.csv" ].empty
129
-
128
+ value_columns = ["val" , "se" , "sample_size" ]
129
+ for geo in geos :
130
+ assert np .isnan (outputs ["20200319_" + geo + "_smoothed_outpatient_covid.csv" ][value_columns ]).all ().all ()
131
+ assert outputs ["20200319_" + geo + "_smoothed_outpatient_covid.csv" ]["missing_val" ].eq (3 ).all ()
130
132
131
133
class TestWriteToCsv :
132
134
"""Tests for writing output files to CSV."""
@@ -137,16 +139,18 @@ def test_write_to_csv_results(self):
137
139
"se" : [0.1 , 1 , 1.1 ] + [0.5 , np .nan , 0.5 ],
138
140
"sample_size" : [np .nan ] * 6 ,
139
141
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
140
- "include " : [True , True , True ] + [True , False , True ],
142
+ "incl " : [True , True , True ] + [True , False , True ],
141
143
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
142
144
})
143
145
144
146
td = TemporaryDirectory ()
145
147
148
+ res0 = censor_columns (res0 , ["sample_size" , "se" ])
149
+ res0 = add_nancodes (res0 , write_se = False )
150
+
146
151
write_to_csv (
147
- res0 [ res0 [ 'include' ]] ,
152
+ res0 ,
148
153
geo_level = "geography" ,
149
- write_se = False ,
150
154
day_shift = CONFIG .DAY_SHIFT ,
151
155
out_name = "name_of_signal" ,
152
156
output_path = td .name
@@ -156,7 +160,10 @@ def test_write_to_csv_results(self):
156
160
expected_name = "20200502_geography_name_of_signal.csv"
157
161
assert exists (join (td .name , expected_name ))
158
162
output_data = pd .read_csv (join (td .name , expected_name ))
159
- expected_columns = ["geo_id" , "val" , "se" , "sample_size" ]
163
+ expected_columns = [
164
+ "geo_id" , "val" , "se" , "sample_size" ,
165
+ "missing_val" , "missing_se" , "missing_sample_size"
166
+ ]
160
167
assert (output_data .columns == expected_columns ).all ()
161
168
assert (output_data .geo_id == ["a" , "b" ]).all ()
162
169
assert np .array_equal (output_data .val .values , np .array ([0.1 , 1 ]))
@@ -169,8 +176,8 @@ def test_write_to_csv_results(self):
169
176
assert exists (join (td .name , expected_name ))
170
177
output_data = pd .read_csv (join (td .name , expected_name ))
171
178
assert (output_data .columns == expected_columns ).all ()
172
- assert (output_data .geo_id == ["a" ]).all ()
173
- assert np .array_equal (output_data .val .values , np .array ([0.5 ]) )
179
+ assert (output_data .geo_id == ["a" , "b" ]).all ()
180
+ assert np .array_equal (output_data .val .values , np .array ([0.5 , np . nan ]), equal_nan = True )
174
181
assert np .isnan (output_data .se .values ).all ()
175
182
assert np .isnan (output_data .sample_size .values ).all ()
176
183
@@ -192,88 +199,100 @@ def test_write_to_csv_with_se_results(self):
192
199
"se" : [0.1 , 1 , 1.1 ] + [0.5 , np .nan , 0.5 ],
193
200
"sample_size" : [np .nan ] * 6 ,
194
201
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
195
- "include " : [True , True , True ] + [True , False , True ],
202
+ "incl " : [True , True , True ] + [True , False , True ],
196
203
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
197
204
})
198
205
206
+ res0 = add_nancodes (res0 , write_se = True )
207
+
199
208
td = TemporaryDirectory ()
200
209
write_to_csv (
201
- res0 [ res0 [ 'include' ]] ,
210
+ res0 ,
202
211
geo_level = "geography" ,
203
- write_se = True ,
204
212
day_shift = CONFIG .DAY_SHIFT ,
205
213
out_name = "name_of_signal" ,
206
214
output_path = td .name
207
215
)
208
216
209
217
# check outputs
210
218
expected_name = "20200502_geography_name_of_signal.csv"
219
+ expected_columns = [
220
+ "geo_id" , "val" , "se" , "sample_size" ,
221
+ "missing_val" , "missing_se" , "missing_sample_size"
222
+ ]
211
223
assert exists (join (td .name , expected_name ))
212
224
output_data = pd .read_csv (join (td .name , expected_name ))
213
- expected_columns = ["geo_id" , "val" , "se" , "sample_size" ]
214
225
assert (output_data .columns == expected_columns ).all ()
215
226
assert (output_data .geo_id == ["a" , "b" ]).all ()
216
227
assert np .array_equal (output_data .val .values , np .array ([0.1 , 1 ]))
217
228
assert np .array_equal (output_data .se .values , np .array ([0.1 , 0.5 ]))
218
229
assert np .isnan (output_data .sample_size .values ).all ()
219
230
td .cleanup ()
220
231
221
- def test_write_to_csv_wrong_results (self ):
222
- """Tests that nonsensical inputs trigger exceptions."""
232
+ def test_suspicious_value_logging (self ):
223
233
res0 = pd .DataFrame ({
224
- "val" : [0.1 , 0.5 , 1.5 ] + [1 , 2 , 3 ],
225
- "se" : [0.1 , 1 , 1.1 ] + [0.5 , 0.5 , 0.5 ],
234
+ "val" : [91 , 0.5 , 1.5 ] + [1 , 2 , 3 ],
235
+ "se" : [0.1 , 1 , 1.1 ] + [0.5 , np . nan , 0.5 ],
226
236
"sample_size" : [np .nan ] * 6 ,
227
237
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
228
- "include " : [True , True , True ] + [True , False , True ],
238
+ "incl " : [True , True , True ] + [True , False , True ],
229
239
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
230
- }).set_index (["timestamp" , "geo_id" ]).sort_index ()
240
+ })
241
+
242
+ res0 = add_nancodes (res0 , write_se = True )
231
243
244
+ mock_logger = mock .Mock ()
232
245
td = TemporaryDirectory ()
246
+ write_to_csv (
247
+ res0 ,
248
+ geo_level = "geography" ,
249
+ day_shift = CONFIG .DAY_SHIFT ,
250
+ out_name = "name_of_signal" ,
251
+ output_path = td .name ,
252
+ logger = mock_logger
253
+ )
233
254
234
- # nan value for included loc-date
235
- res1 = res0 .copy ()
236
- res1 = res1 [res1 ['include' ]]
237
- res1 .loc [("2020-05-01" , "a" ), "val" ] = np .nan
238
- res1 .reset_index (inplace = True )
239
- with pytest .raises (AssertionError ):
240
- write_to_csv (
241
- res1 ,
242
- geo_level = "geography" ,
243
- write_se = False ,
244
- day_shift = CONFIG .DAY_SHIFT ,
245
- out_name = "name_of_signal" ,
246
- output_path = td .name
247
- )
255
+ mock_logger .warning .assert_called_once_with (
256
+ "value suspiciously high, {0}: {1}" .format ("a" , "name_of_signal" )
257
+ )
248
258
249
- # nan se for included loc-date
250
- res2 = res0 .copy ()
251
- res2 = res2 [res2 ['include' ]]
252
- res2 .loc [("2020-05-01" , "a" ), "se" ] = np .nan
253
- res2 .reset_index (inplace = True )
254
- with pytest .raises (AssertionError ):
255
- write_to_csv (
256
- res2 ,
257
- geo_level = "geography" ,
258
- write_se = True ,
259
- day_shift = CONFIG .DAY_SHIFT ,
260
- out_name = "name_of_signal" ,
261
- output_path = td .name
262
- )
259
+ def test_add_nancodes (self ):
260
+ """Tests that nancodes are correctly added."""
261
+ res0 = pd .DataFrame ({
262
+ "val" : [np .nan , 0.5 , 1.5 ] + [1 , 2 , 3 ],
263
+ "se" : [np .nan , 1 , 1.1 ] + [np .nan , np .nan , 0.5 ],
264
+ "sample_size" : [np .nan ] * 6 ,
265
+ "timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
266
+ "incl" : [True , True , True ] + [True , False , True ],
267
+ "geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
268
+ }).set_index (["timestamp" , "geo_id" ]).sort_index ()
263
269
264
- # large se value
265
- res3 = res0 .copy ()
266
- res3 = res3 [res3 ['include' ]]
267
- res3 .loc [("2020-05-01" , "a" ), "se" ] = 10
268
- res3 .reset_index (inplace = True )
269
- with pytest .raises (AssertionError ):
270
- write_to_csv (
271
- res3 ,
272
- geo_level = "geography" ,
273
- write_se = True ,
274
- day_shift = CONFIG .DAY_SHIFT ,
275
- out_name = "name_of_signal" ,
276
- output_path = td .name
277
- )
270
+ expected_df = pd .DataFrame ({
271
+ "val" : [np .nan , 0.5 , 1.5 ] + [1 , np .nan , 3 ],
272
+ "se" : [np .nan , 1 , 1.1 ] + [np .nan , np .nan , 0.5 ],
273
+ "sample_size" : [np .nan ] * 6 ,
274
+ "timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
275
+ "incl" : [True , True , True ] + [True , False , True ],
276
+ "geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
277
+ "missing_val" : [Nans .OTHER ] + [Nans .NOT_MISSING ] * 3 + [Nans .CENSORED , Nans .NOT_MISSING ],
278
+ "missing_se" : [Nans .OTHER ] + [Nans .NOT_MISSING ] * 2 + [Nans .OTHER , Nans .CENSORED , Nans .NOT_MISSING ],
279
+ "missing_sample_size" : [Nans .CENSORED ] * 6 ,
280
+ }).set_index (["timestamp" , "geo_id" ]).sort_index ()
278
281
279
- td .cleanup ()
282
+ res = censor_columns (res0 , ["sample_size" ])
283
+ pd .testing .assert_frame_equal (expected_df , add_nancodes (res , write_se = True ))
284
+
285
+ expected_df = pd .DataFrame ({
286
+ "val" : [np .nan , 0.5 , 1.5 ] + [1 , np .nan , 3 ],
287
+ "se" : [np .nan ] * 6 ,
288
+ "sample_size" : [np .nan ] * 6 ,
289
+ "timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
290
+ "incl" : [True , True , True ] + [True , False , True ],
291
+ "geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
292
+ "missing_val" : [Nans .OTHER ] + [Nans .NOT_MISSING ] * 3 + [Nans .CENSORED , Nans .NOT_MISSING ],
293
+ "missing_se" : [Nans .CENSORED ] * 6 ,
294
+ "missing_sample_size" : [Nans .CENSORED ] * 6 ,
295
+ }).set_index (["timestamp" , "geo_id" ]).sort_index ()
296
+
297
+ res = censor_columns (res0 , ["sample_size" , "se" ])
298
+ pd .testing .assert_frame_equal (expected_df , add_nancodes (res , write_se = False ))
0 commit comments