 from os.path import join, basename

 import pandas as pd
+import numpy as np
+from delphi_jhu.run import add_nancodes
+from delphi_utils import Nans
+
+def _non_ignored_files_set(directory):
+    """List all files in a directory not preceded by a '.' and store them in a set."""
+    out = {fname for fname in listdir(directory) if not basename(fname).startswith(".")}
+    return out


 class TestRun:
     def test_output_files_exist(self, run_as_module):

-        csv_files = [x for x in listdir("receiving") if not basename(x).startswith(".")]
+        csv_files = _non_ignored_files_set("receiving")

         dates = [
             "20200303",
@@ -17,29 +25,86 @@ def test_output_files_exist(self, run_as_module):
             "20200307",
             "20200308",
             "20200309",
-            "20200310",
+            "20200310"
         ]
         geos = ["county", "hrr", "msa", "state", "hhs", "nation"]
-        metrics = []
-        for event in ["confirmed", "deaths"]:
-            for smoothing in ["", "_7dav"]:
-                for window in ["incidence", "cumulative"]:
-                    for stat in ["num", "prop"]:
-                        metrics.append(f"{event}{smoothing}_{window}_{stat}")
-
-        expected_files = []
-        for date in dates:
-            for geo in geos:
-                for metric in metrics:
-                    # Can't compute 7dav for first few days of data because of NAs
-                    if date > "20200305" or "7dav" not in metric:
-                        expected_files += [date + "_" + geo + "_" + metric + ".csv"]
-
-        assert set(csv_files) == set(expected_files)
+        signals = ["confirmed", "deaths"]
+        metrics = [
+            "cumulative_num",
+            "cumulative_prop",
+            "incidence_num",
+            "incidence_prop",
+            "7dav_incidence_num",
+            "7dav_incidence_prop",
+            "7dav_cumulative_num",
+            "7dav_cumulative_prop",
+        ]
+
+        expected_files = {
+            date + "_" + geo + "_" + signal + "_" + metric + ".csv"
+            for date in dates
+            for geo in geos
+            for signal in signals
+            for metric in metrics
+        }
+
+        assert csv_files == expected_files

     def test_output_file_format(self, run_as_module):

         df = pd.read_csv(
             join("receiving", "20200310_state_confirmed_cumulative_num.csv")
         )
-        assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
+        assert (
+            df.columns.values
+            == [
+                "geo_id",
+                "val",
+                "se",
+                "sample_size",
+                "missing_val",
+                "missing_se",
+                "missing_sample_size",
+            ]
+        ).all()
+
+    def test_add_nancodes(self):
+        df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
+        expected_df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.NOT_MISSING] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_nancodes(df, "deaths", "county", None), expected_df)
+
+        df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
+        expected_df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.PRIVACY] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_nancodes(df2, "deaths", "county", "seven_day_average"), expected_df2)
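
Note: the assertions in test_add_nancodes pin down the behavior of add_nancodes without showing its implementation. The sketch below is logic consistent with those expectations, not the actual body in delphi_jhu/run.py: the signature matches the calls above, and it assumes the Nans members the tests use (NOT_MISSING, NOT_APPLICABLE, PRIVACY, REGION_EXCEPTION, UNKNOWN), with PRIVACY marking early dates for which a seven-day average cannot yet be computed.

import pandas as pd
from delphi_utils import Nans

def add_nancodes(df, metric, geo_res, sensor):
    """Annotate a (timestamp, geo_id)-indexed frame with missingness codes (sketch)."""
    # Defaults: values present; JHU never reports standard errors or sample sizes.
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    dates = df.index.get_level_values("timestamp")
    geos = df.index.get_level_values("geo_id")

    # A seven-day average is undefined for the first six days of data.
    if sensor == "seven_day_average":
        early = (dates < dates.min() + pd.Timedelta(days=6)) & df["val"].isna().values
        df.loc[early, "missing_val"] = Nans.PRIVACY

    # County-level deaths are not reported for Puerto Rico (FIPS 72001).
    if metric == "deaths" and geo_res == "county":
        df.loc[geos == "72001", "missing_val"] = Nans.REGION_EXCEPTION

    # Anything still missing has no assigned explanation.
    unknown = df["val"].isna() & (df["missing_val"] == Nans.NOT_MISSING)
    df.loc[unknown, "missing_val"] = Nans.UNKNOWN
    return df

Ordering matters in this sketch: the region exception is applied after the early-window code, so 72001 ends up as REGION_EXCEPTION rather than PRIVACY in the smoothed case, exactly as expected_df2 encodes.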