14
14
15
15
from delphi_utils .archive import ArchiveDiffer , GitArchiveDiffer , S3ArchiveDiffer ,\
16
16
archiver_from_params
17
+ from delphi_utils import Nans
17
18
18
# Column dtypes passed to ``pd.read_csv(..., dtype=CSV_DTYPES)`` when the tests
# read exported CSVs back in.  The three ``missing_*`` columns are read as
# integers; the fixtures in this file fill them with ``Nans`` members.
CSV_DTYPES = {
    "geo_id": str,
    "val": float,
    "se": float,
    "sample_size": float,
    "missing_val": int,
    "missing_se": int,
    "missing_sample_size": int,
}
19
23
20
24
# Fixture frames for the "before" state, keyed by CSV name without extension.
# NOTE(review): presumably these are written into the cache directory by the
# tests -- the code that writes them is outside this view; confirm.
CSVS_BEFORE = {
    # Common
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.000000001, 2.00000002, 3.00000003],
        "se": [0.1, 0.2, 0.3],
        "sample_size": [10.0, 20.0, 30.0],
        "missing_val": [Nans.NOT_MISSING] * 3,
        "missing_se": [Nans.NOT_MISSING] * 3,
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "se": [np.nan, 0.20000002, 0.30000003],
        "sample_size": [10.0, 20.0, 30.0],
        "missing_val": [Nans.NOT_MISSING] * 3,
        "missing_se": [Nans.NOT_MISSING] * 3,
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

    # Deleted
    "csv2": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0],
        "missing_val": [Nans.NOT_MISSING],
        "missing_se": [Nans.NOT_MISSING],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

    # Common, but updated with missing columns.
    # This "before" frame intentionally has no ``missing_*`` columns; the
    # matching entry in CSVS_AFTER carries them.
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0],
    }),
}
41
65
42
66
CSVS_AFTER = {
45
69
"geo_id" : ["1" , "2" , "3" ],
46
70
"val" : [1.0 , 2.0 , 3.0 ],
47
71
"se" : [0.10000001 , 0.20000002 , 0.30000003 ],
48
- "sample_size" : [10.0 , 20.0 , 30.0 ]}),
72
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
73
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
74
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
75
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
76
+ }),
49
77
50
78
"csv1" : pd .DataFrame ({
51
79
"geo_id" : ["1" , "2" , "4" ],
52
80
"val" : [1.0 , 2.1 , 4.0 ],
53
81
"se" : [np .nan , 0.21 , np .nan ],
54
- "sample_size" : [10.0 , 21.0 , 40.0 ]}),
82
+ "sample_size" : [10.0 , 21.0 , 40.0 ],
83
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
84
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
85
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
86
+ }),
55
87
56
88
# Added
57
89
"csv3" : pd .DataFrame ({
58
90
"geo_id" : ["2" ],
59
91
"val" : [2.0000002 ],
60
92
"se" : [0.2 ],
61
- "sample_size" : [20.0 ]}),
93
+ "sample_size" : [20.0 ],
94
+ "missing_val" : [Nans .NOT_MISSING ],
95
+ "missing_se" : [Nans .NOT_MISSING ],
96
+ "missing_sample_size" : [Nans .NOT_MISSING ],
97
+ }),
98
+
99
+ # Common, but updated with missing columns
100
+ "csv4" : pd .DataFrame ({
101
+ "geo_id" : ["1" ],
102
+ "val" : [1.0 ],
103
+ "se" : [0.1 ],
104
+ "sample_size" : [10.0 ],
105
+ "missing_val" : [Nans .NOT_MISSING ],
106
+ "missing_se" : [Nans .NOT_MISSING ],
107
+ "missing_sample_size" : [Nans .NOT_MISSING ],
108
+ }),
62
109
}
63
110
64
-
65
111
class TestArchiveDiffer :
66
112
67
113
def test_stubs (self ):
@@ -80,10 +126,14 @@ def test_diff_and_filter_exports(self, tmp_path):
80
126
mkdir (export_dir )
81
127
82
128
csv1_diff = pd .DataFrame ({
83
- "geo_id" : ["2" , "4" ],
84
- "val" : [2.1 , 4.0 ],
85
- "se" : [0.21 , np .nan ],
86
- "sample_size" : [21.0 , 40.0 ]})
129
+ "geo_id" : ["3" , "2" , "4" ],
130
+ "val" : [np .nan , 2.1 , 4.0 ],
131
+ "se" : [np .nan , 0.21 , np .nan ],
132
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
133
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
134
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
135
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
136
+ })
87
137
88
138
arch_diff = ArchiveDiffer (cache_dir , export_dir )
89
139
@@ -106,15 +156,15 @@ def test_diff_and_filter_exports(self, tmp_path):
106
156
# Check return values
107
157
assert set (deleted_files ) == {join (cache_dir , "csv2.csv" )}
108
158
assert set (common_diffs .keys ()) == {
109
- join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" ]}
159
+ join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" , "csv4.csv" ]}
110
160
assert set (new_files ) == {join (export_dir , "csv3.csv" )}
111
161
assert common_diffs [join (export_dir , "csv0.csv" )] is None
112
162
assert common_diffs [join (export_dir , "csv1.csv" )] == join (
113
163
export_dir , "csv1.csv.diff" )
114
164
115
165
# Check filesystem for actual files
116
166
assert set (listdir (export_dir )) == {
117
- "csv0.csv" , "csv1.csv" , "csv1.csv.diff" , "csv3.csv" }
167
+ "csv0.csv" , "csv1.csv" , "csv1.csv.diff" , "csv3.csv" , "csv4.csv" , "csv4.csv.diff" }
118
168
assert_frame_equal (
119
169
pd .read_csv (join (export_dir , "csv1.csv.diff" ), dtype = CSV_DTYPES ),
120
170
csv1_diff )
@@ -132,7 +182,7 @@ def test_diff_and_filter_exports(self, tmp_path):
132
182
arch_diff .filter_exports (common_diffs )
133
183
134
184
# Check exports directory just has incremental changes
135
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
185
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" }
136
186
assert_frame_equal (
137
187
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
138
188
csv1_diff )
@@ -259,12 +309,16 @@ def test_run(self, tmp_path, s3_client):
259
309
assert_frame_equal (pd .read_csv (body , dtype = CSV_DTYPES ), df )
260
310
261
311
# Check exports directory just has incremental changes
262
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
312
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" }
263
313
csv1_diff = pd .DataFrame ({
264
- "geo_id" : ["2" , "4" ],
265
- "val" : [2.1 , 4.0 ],
266
- "se" : [0.21 , np .nan ],
267
- "sample_size" : [21.0 , 40.0 ]})
314
+ "geo_id" : ["3" , "2" , "4" ],
315
+ "val" : [np .nan , 2.1 , 4.0 ],
316
+ "se" : [np .nan , 0.21 , np .nan ],
317
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
318
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
319
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
320
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
321
+ })
268
322
assert_frame_equal (
269
323
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
270
324
csv1_diff )
@@ -346,7 +400,11 @@ def test_diff_exports(self, tmp_path):
346
400
"geo_id" : ["1" , "2" , "3" ],
347
401
"val" : [1.0 , 2.0 , 3.0 ],
348
402
"se" : [0.1 , 0.2 , 0.3 ],
349
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
403
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
404
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
405
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
406
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
407
+ })
350
408
351
409
# Write exact same CSV into cache and export, so no diffs expected
352
410
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -383,7 +441,11 @@ def test_archive_exports(self, tmp_path):
383
441
"geo_id" : ["1" , "2" , "3" ],
384
442
"val" : [1.0 , 2.0 , 3.0 ],
385
443
"se" : [0.1 , 0.2 , 0.3 ],
386
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
444
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
445
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
446
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
447
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
448
+ })
387
449
388
450
# csv1.csv is now a dirty edit in the repo, and to be exported too
389
451
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -460,12 +522,16 @@ def test_run(self, tmp_path):
460
522
original_branch .checkout ()
461
523
462
524
# Check exports directory just has incremental changes
463
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
525
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" }
464
526
csv1_diff = pd .DataFrame ({
465
- "geo_id" : ["2" , "4" ],
466
- "val" : [2.1 , 4.0 ],
467
- "se" : [0.21 , np .nan ],
468
- "sample_size" : [21.0 , 40.0 ]})
527
+ "geo_id" : ["3" , "2" , "4" ],
528
+ "val" : [np .nan , 2.1 , 4.0 ],
529
+ "se" : [np .nan , 0.21 , np .nan ],
530
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
531
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
532
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
533
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
534
+ })
469
535
assert_frame_equal (
470
536
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
471
537
csv1_diff )
0 commit comments