from os.path import join, basename

import pandas as pd
+import numpy as np
+from delphi_jhu.run import add_nancodes
+from delphi_utils import Nans


+def _non_ignored_files_set(directory):
+    """List all files in a directory whose names do not start with a '.' and return them as a set."""
+    out = {fname for fname in listdir(directory) if not basename(fname).startswith(".")}
+    return out
+
class TestRun:
    def test_output_files_exist(self, run_as_module):

-        csv_files = [x for x in listdir("receiving") if not basename(x).startswith(".")]
+        csv_files = _non_ignored_files_set("receiving")

        dates = [
            "20200303",
@@ -17,29 +25,86 @@ def test_output_files_exist(self, run_as_module):
            "20200307",
            "20200308",
            "20200309",
-            "20200310",
+            "20200310"
        ]
        geos = ["county", "hrr", "msa", "state", "hhs", "nation"]
-        metrics = []
-        for event in ["confirmed", "deaths"]:
-            for smoothing in ["", "_7dav"]:
-                for window in ["incidence", "cumulative"]:
-                    for stat in ["num", "prop"]:
-                        metrics.append(f"{event}{smoothing}_{window}_{stat}")
-
-        expected_files = []
-        for date in dates:
-            for geo in geos:
-                for metric in metrics:
-                    # Can't compute 7dav for first few days of data because of NAs
-                    if date > "20200305" or "7dav" not in metric:
-                        expected_files += [date + "_" + geo + "_" + metric + ".csv"]
-
-        assert set(csv_files) == set(expected_files)
+        signals = ["confirmed", "deaths"]
+        metrics = [
+            "cumulative_num",
+            "cumulative_prop",
+            "incidence_num",
+            "incidence_prop",
+            "7dav_incidence_num",
+            "7dav_incidence_prop",
+            "7dav_cumulative_num",
+            "7dav_cumulative_prop",
+        ]
+
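+        # Expected filenames follow the {date}_{geo}_{signal}_{metric}.csv pattern.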
+        expected_files = {
+            date + "_" + geo + "_" + signal + "_" + metric + ".csv"
+            for date in dates
+            for geo in geos
+            for signal in signals
+            for metric in metrics
+        }
+
+        assert csv_files == expected_files

    def test_output_file_format(self, run_as_module):

        df = pd.read_csv(
            join("receiving", "20200310_state_confirmed_cumulative_num.csv")
        )
-        assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
+        assert (
+            df.columns.values
+            == [
+                "geo_id",
+                "val",
+                "se",
+                "sample_size",
+                "missing_val",
+                "missing_se",
+                "missing_sample_size",
+            ]
+        ).all()
+
+    def test_add_nancodes(self):
+        df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
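+        # Expected output: the input frame plus missing_val / missing_se / missing_sample_size code columns.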
+        expected_df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.NOT_MISSING] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
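+        # First case: no smoother (last argument is None).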
+        pd.testing.assert_frame_equal(add_nancodes(df, "deaths", "county", None), expected_df)
+
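+        # Second case: leading NaN values with the "seven_day_average" smoother.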
+        df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
+        expected_df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.PRIVACY] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_nancodes(df2, "deaths", "county", "seven_day_average"), expected_df2)