11
11
from boto3 import Session
12
12
from moto import mock_s3
13
13
import pytest
14
+ import mock
14
15
15
16
# first party
16
17
from delphi_changehc .config import Config
17
- from delphi_changehc .update_sensor import write_to_csv , CHCSensorUpdater
18
+ from delphi_changehc .update_sensor import add_nancodes , censor_columns , write_to_csv , CHCSensorUpdater
19
+ from delphi_utils .nancodes import Nans
18
20
19
21
CONFIG = Config ()
20
22
PARAMS = {
@@ -96,7 +98,8 @@ def test_geo_reindex(self):
96
98
def test_update_sensor (self ):
97
99
"""Tests that the sensors are properly updated."""
98
100
outputs = {}
99
- for geo in ["county" , "state" , "hhs" , "nation" ]:
101
+ geos = ["county" , "state" , "hhs" , "nation" ]
102
+ for geo in geos :
100
103
td = TemporaryDirectory ()
101
104
su_inst = CHCSensorUpdater (
102
105
"03-01-2020" ,
@@ -127,11 +130,10 @@ def test_update_sensor(self):
127
130
assert len (os .listdir (td .name )) == len (su_inst .sensor_dates ),\
128
131
f"failed { geo } update sensor test"
129
132
td .cleanup ()
130
- assert outputs ["20200319_county_smoothed_outpatient_covid.csv" ].empty
131
- assert outputs ["20200319_state_smoothed_outpatient_covid.csv" ].empty
132
- assert outputs ["20200319_hhs_smoothed_outpatient_covid.csv" ].empty
133
- assert outputs ["20200319_nation_smoothed_outpatient_covid.csv" ].empty
134
-
133
+ value_columns = ["val" , "se" , "sample_size" ]
134
+ for geo in geos :
135
+ assert np .isnan (outputs ["20200319_" + geo + "_smoothed_outpatient_covid.csv" ][value_columns ]).all ().all ()
136
+ assert outputs ["20200319_" + geo + "_smoothed_outpatient_covid.csv" ]["missing_val" ].eq (3 ).all ()
135
137
136
138
class TestWriteToCsv :
137
139
"""Tests for writing output files to CSV."""
@@ -142,16 +144,19 @@ def test_write_to_csv_results(self):
142
144
"se" : [0.1 , 1 , 1.1 ] + [0.5 , np .nan , 0.5 ],
143
145
"sample_size" : [np .nan ] * 6 ,
144
146
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
145
- "include " : [True , True , True ] + [True , False , True ],
147
+ "incl " : [True , True , True ] + [True , False , True ],
146
148
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
147
149
})
148
150
149
151
td = TemporaryDirectory ()
150
152
153
+ res0 = censor_columns (res0 , ["sample_size" , "se" ])
154
+ res0 = add_nancodes (res0 , write_se = False )
155
+
151
156
write_to_csv (
152
- res0 [ res0 [ 'include' ]] ,
157
+ res0 ,
153
158
geo_level = "geography" ,
154
- write_se = False ,
159
+ write_se = True ,
155
160
day_shift = CONFIG .DAY_SHIFT ,
156
161
out_name = "name_of_signal" ,
157
162
output_path = td .name ,
@@ -162,7 +167,10 @@ def test_write_to_csv_results(self):
162
167
expected_name = "20200502_geography_name_of_signal.csv"
163
168
assert exists (join (td .name , expected_name ))
164
169
output_data = pd .read_csv (join (td .name , expected_name ))
165
- expected_columns = ["geo_id" , "val" , "se" , "sample_size" ]
170
+ expected_columns = [
171
+ "geo_id" , "val" , "se" , "sample_size" ,
172
+ "missing_val" , "missing_se" , "missing_sample_size"
173
+ ]
166
174
assert (output_data .columns == expected_columns ).all ()
167
175
assert (output_data .geo_id == ["a" , "b" ]).all ()
168
176
assert np .array_equal (output_data .val .values , np .array ([0.1 , 1 ]))
@@ -175,8 +183,8 @@ def test_write_to_csv_results(self):
175
183
assert exists (join (td .name , expected_name ))
176
184
output_data = pd .read_csv (join (td .name , expected_name ))
177
185
assert (output_data .columns == expected_columns ).all ()
178
- assert (output_data .geo_id == ["a" ]).all ()
179
- assert np .array_equal (output_data .val .values , np .array ([0.5 ]) )
186
+ assert (output_data .geo_id == ["a" , "b" ]).all ()
187
+ assert np .array_equal (output_data .val .values , np .array ([0.5 , np . nan ]), equal_nan = True )
180
188
assert np .isnan (output_data .se .values ).all ()
181
189
assert np .isnan (output_data .sample_size .values ).all ()
182
190
@@ -198,13 +206,15 @@ def test_write_to_csv_with_se_results(self):
198
206
"se" : [0.1 , 1 , 1.1 ] + [0.5 , np .nan , 0.5 ],
199
207
"sample_size" : [np .nan ] * 6 ,
200
208
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
201
- "include " : [True , True , True ] + [True , False , True ],
209
+ "incl " : [True , True , True ] + [True , False , True ],
202
210
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
203
211
})
204
212
213
+ res0 = add_nancodes (res0 , write_se = True )
214
+
205
215
td = TemporaryDirectory ()
206
216
write_to_csv (
207
- res0 [ res0 [ 'include' ]] ,
217
+ res0 ,
208
218
geo_level = "geography" ,
209
219
write_se = True ,
210
220
day_shift = CONFIG .DAY_SHIFT ,
@@ -215,64 +225,46 @@ def test_write_to_csv_with_se_results(self):
215
225
216
226
# check outputs
217
227
expected_name = "20200502_geography_name_of_signal.csv"
228
+ expected_columns = [
229
+ "geo_id" , "val" , "se" , "sample_size" ,
230
+ "missing_val" , "missing_se" , "missing_sample_size"
231
+ ]
218
232
assert exists (join (td .name , expected_name ))
219
233
output_data = pd .read_csv (join (td .name , expected_name ))
220
- expected_columns = ["geo_id" , "val" , "se" , "sample_size" ]
221
234
assert (output_data .columns == expected_columns ).all ()
222
235
assert (output_data .geo_id == ["a" , "b" ]).all ()
223
236
assert np .array_equal (output_data .val .values , np .array ([0.1 , 1 ]))
224
237
assert np .array_equal (output_data .se .values , np .array ([0.1 , 0.5 ]))
225
238
assert np .isnan (output_data .sample_size .values ).all ()
226
239
td .cleanup ()
227
240
228
- def test_write_to_csv_wrong_results (self ):
229
- """Tests that nonsensical inputs trigger exceptions."""
241
+ def test_suspicious_value_logging (self ):
230
242
res0 = pd .DataFrame ({
231
- "val" : [0.1 , 0.5 , 1.5 ] + [1 , 2 , 3 ],
232
- "se" : [0.1 , 1 , 1.1 ] + [0.5 , 0.5 , 0.5 ],
243
+ "val" : [91 , 0.5 , 1.5 ] + [1 , 2 , 3 ],
244
+ "se" : [0.1 , 1 , 1.1 ] + [0.5 , np . nan , 0.5 ],
233
245
"sample_size" : [np .nan ] * 6 ,
234
246
"timestamp" : pd .to_datetime (["2020-05-01" , "2020-05-02" , "2020-05-04" ] * 2 ),
235
- "include " : [True , True , True ] + [True , False , True ],
247
+ "incl " : [True , True , True ] + [True , False , True ],
236
248
"geo_id" : ["a" ] * 3 + ["b" ] * 3 ,
237
- }).set_index (["timestamp" , "geo_id" ]).sort_index ()
238
-
239
- td = TemporaryDirectory ()
249
+ })
240
250
241
- # nan value for included loc-date
242
- res1 = res0 .copy ()
243
- res1 = res1 [res1 ['include' ]]
244
- res1 .loc [("2020-05-01" , "a" ), "val" ] = np .nan
245
- res1 .reset_index (inplace = True )
246
- with pytest .raises (AssertionError ):
247
- write_to_csv (
248
- res1 ,
249
- geo_level = "geography" ,
250
- write_se = False ,
251
- day_shift = CONFIG .DAY_SHIFT ,
252
- out_name = "name_of_signal" ,
253
- output_path = td .name ,
254
- logger = TEST_LOGGER
255
- )
251
+ res0 = add_nancodes (res0 , write_se = True )
256
252
257
- # nan se for included loc-date
258
- res2 = res0 .copy ()
259
- res2 = res2 [res2 ['include' ]]
260
- res2 .loc [("2020-05-01" , "a" ), "se" ] = np .nan
261
- res2 .reset_index (inplace = True )
262
- with pytest .raises (AssertionError ):
263
- write_to_csv (
264
- res2 ,
265
- geo_level = "geography" ,
266
- write_se = True ,
267
- day_shift = CONFIG .DAY_SHIFT ,
268
- out_name = "name_of_signal" ,
269
- output_path = td .name ,
270
- logger = TEST_LOGGER
271
- )
253
+ mock_logger = mock .Mock ()
254
+ td = TemporaryDirectory ()
255
+ write_to_csv (
256
+ res0 ,
257
+ geo_level = "geography" ,
258
+ write_se = True ,
259
+ day_shift = CONFIG .DAY_SHIFT ,
260
+ out_name = "name_of_signal" ,
261
+ output_path = td .name ,
262
+ logger = mock_logger
263
+ )
272
264
273
265
# large se value
274
- res3 = res0 .copy ()
275
- res3 = res3 [res3 ['include ' ]]
266
+ res3 = res0 .copy (). set_index ([ "timestamp" , "geo_id" ])
267
+ res3 = res3 [res3 ['incl ' ]]
276
268
res3 .loc [("2020-05-01" , "a" ), "se" ] = 10
277
269
res3 .reset_index (inplace = True )
278
270
with pytest .raises (AssertionError ):
@@ -286,4 +278,47 @@ def test_write_to_csv_wrong_results(self):
286
278
logger = TEST_LOGGER
287
279
)
288
280
289
- td .cleanup ()
281
+ mock_logger .warning .assert_called_once_with (
282
+ "value suspiciously high, {0}: {1}" .format ("a" , "name_of_signal" )
283
+ )
284
+
285
    def test_add_nancodes(self):
        """Tests that nancodes are correctly added.

        Exercises `censor_columns` + `add_nancodes` on a small frame indexed
        by (timestamp, geo_id), checking the produced `missing_val`,
        `missing_se`, and `missing_sample_size` columns for both
        `write_se=True` and `write_se=False`.
        """
        # Input: one NaN val (2020-05-01, "a"); NaN se in three rows;
        # sample_size all-NaN; one row with incl == False ("b", 2020-05-02).
        res0 = pd.DataFrame({
            "val": [np.nan, 0.5, 1.5] + [1, 2, 3],
            "se": [np.nan, 1, 1.1] + [np.nan, np.nan, 0.5],
            "sample_size": [np.nan] * 6,
            "timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
            "incl": [True, True, True] + [True, False, True],
            "geo_id": ["a"] * 3 + ["b"] * 3,
        }).set_index(["timestamp", "geo_id"]).sort_index()

        # Case 1: censor only sample_size, write_se=True.
        # Expected: pre-existing NaNs get Nans.OTHER, censored columns get
        # Nans.CENSORED, everything else Nans.NOT_MISSING. The incl == False
        # row appears to be censored as well (val -> NaN, CENSORED code) —
        # NOTE(review): confirm against add_nancodes' handling of `incl`.
        expected_df = pd.DataFrame({
            "val": [np.nan, 0.5, 1.5] + [1, np.nan, 3],
            "se": [np.nan, 1, 1.1] + [np.nan, np.nan, 0.5],
            "sample_size": [np.nan] * 6,
            "timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
            "incl": [True, True, True] + [True, False, True],
            "geo_id": ["a"] * 3 + ["b"] * 3,
            "missing_val": [Nans.OTHER] + [Nans.NOT_MISSING] * 3 + [Nans.CENSORED, Nans.NOT_MISSING],
            "missing_se": [Nans.OTHER] + [Nans.NOT_MISSING] * 2 + [Nans.OTHER, Nans.CENSORED, Nans.NOT_MISSING],
            "missing_sample_size": [Nans.CENSORED] * 6,
        }).set_index(["timestamp", "geo_id"]).sort_index()

        res = censor_columns(res0, ["sample_size"])
        pd.testing.assert_frame_equal(expected_df, add_nancodes(res, write_se=True))

        # Case 2: censor sample_size AND se, write_se=False.
        # Expected: se values are dropped (all NaN) and missing_se is
        # CENSORED everywhere; val expectations are unchanged from case 1.
        expected_df = pd.DataFrame({
            "val": [np.nan, 0.5, 1.5] + [1, np.nan, 3],
            "se": [np.nan] * 6,
            "sample_size": [np.nan] * 6,
            "timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
            "incl": [True, True, True] + [True, False, True],
            "geo_id": ["a"] * 3 + ["b"] * 3,
            "missing_val": [Nans.OTHER] + [Nans.NOT_MISSING] * 3 + [Nans.CENSORED, Nans.NOT_MISSING],
            "missing_se": [Nans.CENSORED] * 6,
            "missing_sample_size": [Nans.CENSORED] * 6,
        }).set_index(["timestamp", "geo_id"]).sort_index()

        res = censor_columns(res0, ["sample_size", "se"])
        pd.testing.assert_frame_equal(expected_df, add_nancodes(res, write_se=False))
0 commit comments