CSV_DTYPES = {
    "geo_id": str, "val": float, "se": float, "sample_size": float,
-    "missing_val": int, "missing_se":int, "missing_sample_size": int
+    "missing_val": int, "missing_se": int, "missing_sample_size": int
}

CSVS_BEFORE = {
-    # Common
+    # All rows unchanged
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.000000001, 2.00000002, 3.00000003],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

+    # One row deleted and one row added
    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

-    # Deleted
+    # File deleted
    "csv2": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0]
    }),

-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
    "csv5": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_se": [Nans.NOT_MISSING],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "2"],
+        "val": [1.0, 2.0],
+        "se": [0.1, 0.2],
+        "sample_size": [10.0, 20.0]
+    }),
}

CSVS_AFTER = {
-    # Common
+    # All rows unchanged
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

+    # One row deleted and one row added
    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "4"],
        "val": [1.0, 2.1, 4.0],
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

-    # Added
+    # File added
    "csv3": pd.DataFrame({
        "geo_id": ["2"],
        "val": [2.0000002],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but updated with missing columns
+    # All rows common, but missing columns added
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

-    # Common, but missing columns removed
+    # All rows common, but missing columns removed
    "csv5": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0]
    }),
+
+    # All rows common, but no missing columns
+    "csv6": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted, but no missing columns (will not be uploaded)
+    "csv7": pd.DataFrame({
+        "geo_id": ["1"],
+        "val": [1.0],
+        "se": [0.1],
+        "sample_size": [10.0]
+    }),
+
+    # Row deleted and row added, but no missing columns
+    "csv8": pd.DataFrame({
+        "geo_id": ["1", "3"],
+        "val": [1.0, 3.0],
+        "se": [0.1, 0.3],
+        "sample_size": [10.0, 30.0]
+    }),
}

class TestArchiveDiffer:
@@ -175,17 +225,22 @@ def test_diff_and_filter_exports(self, tmp_path):
        # Check return values
        assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
        assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
+            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv", "csv8.csv"]}
        assert set(new_files) == {join(export_dir, "csv3.csv")}
        assert common_diffs[join(export_dir, "csv0.csv")] is None
        assert common_diffs[join(export_dir, "csv1.csv")] == join(
            export_dir, "csv1.csv.diff")

        # Check filesystem for actual files
        assert set(listdir(export_dir)) == {
-            "csv0.csv", "csv1.csv", "csv1.csv.diff",
-            "csv3.csv", "csv4.csv", "csv4.csv.diff",
-            "csv5.csv", "csv5.csv.diff"
+            "csv0.csv",
+            "csv1.csv", "csv1.csv.diff",
+            "csv3.csv",
+            "csv4.csv", "csv4.csv.diff",
+            "csv5.csv", "csv5.csv.diff",
+            "csv6.csv",
+            "csv7.csv", "csv7.csv.diff",
+            "csv8.csv", "csv8.csv.diff"
        }
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
@@ -204,7 +259,7 @@ def test_diff_and_filter_exports(self, tmp_path):
        arch_diff.filter_exports(common_diffs)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        assert_frame_equal(
            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
            csv1_diff)
@@ -325,13 +380,11 @@ def test_run(self, tmp_path, s3_client):
        # Check that the buckets now contain the exported files.
        for csv_name, df in CSVS_AFTER.items():
-            body = s3_client.get_object(
-                Bucket=self.bucket_name,
-                Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
+            body = s3_client.get_object(Bucket=self.bucket_name, Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        csv1_diff = pd.DataFrame({
            "geo_id": ["3", "2", "4"],
            "val": [np.nan, 2.1, 4.0],
@@ -539,12 +592,11 @@ def test_run(self, tmp_path):
        arch_diff.get_branch(branch_name).checkout()
        for csv_name, df in CSVS_AFTER.items():
            assert_frame_equal(
-                pd.read_csv(
-                    join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
+                pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), df)
        original_branch.checkout()

        # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
+        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
        csv1_diff = pd.DataFrame({
            "geo_id": ["3", "2", "4"],
            "val": [np.nan, 2.1, 4.0],