2
2
from io import StringIO, BytesIO
from os import listdir, mkdir
from os.path import join
from typing import List, Optional

from boto3 import Session
from git import Repo, exc
19
20
# Dtypes for the legacy CSV schema (no "missing_*" columns).
CSV_DTYPES_OLD = {
    "geo_id": str,
    "val": float,
    "se": float,
    "sample_size": float,
}

# Dtypes for the full CSV schema: the legacy columns plus the "missing_*"
# integer codes that explain why a value is absent.
CSV_DTYPES = dict(CSV_DTYPES_OLD, missing_val=int, missing_se=int, missing_sample_size=int)
23
27
24
28
CSVS_BEFORE = {
25
29
# All rows unchanged
74
78
"missing_sample_size" : [Nans .NOT_MISSING ],
75
79
}),
76
80
77
- # All rows common, but no missing columns
81
+ # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
82
+ # and present again as nan, row 5 deleted previously and absent from file,
83
+ # row 6 deleted previously, but new missing has different NA code
84
+ # (missing columns present)
78
85
"csv6" : pd .DataFrame ({
79
- "geo_id" : ["1" ],
80
- "val" : [1.0 ],
81
- "se" : [0.1 ],
82
- "sample_size" : [10.0 ]
83
- }),
84
-
85
- # Row deleted and row added, but no missing columns (will not be uploaded)
86
+ "geo_id" : ["1" , "2" , "4" , "5" , "6" ],
87
+ "val" : [1.0 , 2.0 , np .nan , np .nan , np .nan ],
88
+ "se" : [0.1 , 0.2 , np .nan , np .nan , np .nan ],
89
+ "sample_size" : [10.0 , 20.0 , np .nan , np .nan , np .nan ],
90
+ "missing_val" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] * 3 ,
91
+ "missing_se" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] * 3 ,
92
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] * 3 ,
93
+ }),
94
+
95
+ # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
96
+ # and present again as nan, row 5 deleted previously and absent from file
97
+ # (no missing columns)
86
98
"csv7" : pd .DataFrame ({
87
- "geo_id" : ["1" , "2" ],
88
- "val" : [1.0 , 2.0 ],
89
- "se" : [0.1 , 0.2 ],
90
- "sample_size" : [10.0 , 20.0 ]
91
- }),
92
-
93
- # Row deleted and row added, but no missing columns
94
- "csv8" : pd .DataFrame ({
95
- "geo_id" : ["1" , "2" ],
96
- "val" : [1.0 , 2.0 ],
97
- "se" : [0.1 , 0.2 ],
98
- "sample_size" : [10.0 , 20.0 ]
99
- }),
99
+ "geo_id" : ["1" , "2" , "4" , "5" ],
100
+ "val" : [1.0 , 2.0 , np .nan , np .nan ],
101
+ "se" : [0.1 , 0.2 , np .nan , np .nan ],
102
+ "sample_size" : [10.0 , 20.0 , np .nan , np .nan ]
103
+ })
100
104
}
101
105
102
106
CSVS_AFTER = {
152
156
"sample_size" : [10.0 ]
153
157
}),
154
158
155
- # All rows common, but no missing columns
159
+ # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
160
+ # and present again as nan, row 5 deleted previously and absent from file,
161
+ # row 6 deleted previously, but new missing has different NA code
162
+ # (missing columns present)
156
163
"csv6" : pd .DataFrame ({
157
- "geo_id" : ["1" ],
158
- "val" : [1.0 ],
159
- "se" : [0.1 ],
160
- "sample_size" : [10.0 ]
161
- }),
162
-
163
- # Row deleted and row added, but no missing columns (will not be uploaded)
164
+ "geo_id" : ["1" , "3" , "4" , "6" ],
165
+ "val" : [1.0 , 3.0 , np .nan , np .nan ],
166
+ "se" : [0.1 , 0.3 , np .nan , np .nan ],
167
+ "sample_size" : [10.0 , 30.0 , np .nan , np .nan ],
168
+ "missing_val" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] + [Nans .CENSORED ],
169
+ "missing_se" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] + [Nans .CENSORED ],
170
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 2 + [Nans .DELETED ] + [Nans .CENSORED ],
171
+ }),
172
+
173
+ # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
174
+ # and present again as nan, row 5 deleted previously and absent from file
175
+ # (no missing columns)
164
176
"csv7" : pd .DataFrame ({
165
- "geo_id" : ["1" ],
166
- "val" : [1.0 ],
167
- "se" : [0.1 ],
168
- "sample_size" : [10.0 ]
169
- }),
170
-
171
- # Row deleted and row added, but no missing columns
172
- "csv8" : pd .DataFrame ({
173
- "geo_id" : ["1" , "3" ],
174
- "val" : [1.0 , 3.0 ],
175
- "se" : [0.1 , 0.3 ],
176
- "sample_size" : [10.0 , 30.0 ]
177
- }),
177
+ "geo_id" : ["1" , "3" , "4" ],
178
+ "val" : [1.0 , 3.0 , np .nan ],
179
+ "se" : [0.1 , 0.3 , np .nan ],
180
+ "sample_size" : [10.0 , 30.0 , np .nan ]
181
+ })
178
182
}
179
183
184
import shutil

# Remove stale artifacts from previous test runs. ignore_errors=True makes
# each removal independent and best-effort (a missing directory is fine),
# unlike a single bare try/except, which both hid real errors and skipped
# cleaning /tmp/export whenever removing /tmp/cache failed.
shutil.rmtree("/tmp/cache", ignore_errors=True)
shutil.rmtree("/tmp/export", ignore_errors=True)
190
+
191
+ def _assert_frames_equal_ignore_row_order (df1 , df2 , index_cols : List [str ] = None ):
192
+ return assert_frame_equal (df1 .set_index (index_cols ).sort_index (), df2 .set_index (index_cols ).sort_index ())
193
+
180
194
class TestArchiveDiffer :
181
195
182
196
def test_stubs (self ):
@@ -194,14 +208,32 @@ def test_diff_and_filter_exports(self, tmp_path):
194
208
mkdir (cache_dir )
195
209
mkdir (export_dir )
196
210
197
- csv1_diff = pd .DataFrame ({
198
- "geo_id" : ["3" , "2" , "4" ],
199
- "val" : [np .nan , 2.1 , 4.0 ],
200
- "se" : [np .nan , 0.21 , np .nan ],
201
- "sample_size" : [np .nan , 21.0 , 40.0 ],
202
- "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
203
- "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
204
- "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
211
+ expected_csv1_diff = pd .DataFrame ({
212
+ "geo_id" : ["2" , "3" , "4" ],
213
+ "val" : [2.1 , np .nan , 4.0 ],
214
+ "se" : [0.21 , np .nan , np .nan ],
215
+ "sample_size" : [21.0 , np .nan , 40.0 ],
216
+ "missing_val" : [Nans .NOT_MISSING , Nans .DELETED , Nans .NOT_MISSING ],
217
+ "missing_se" : [Nans .NOT_MISSING , Nans .DELETED , Nans .NOT_MISSING ],
218
+ "missing_sample_size" : [Nans .NOT_MISSING , Nans .DELETED , Nans .NOT_MISSING ],
219
+ })
220
+ expected_csv3 = CSVS_AFTER ["csv3" ]
221
+ expected_csv4_diff = CSVS_AFTER ["csv4" ]
222
+ expected_csv5_diff = CSVS_AFTER ["csv5" ]
223
+ expected_csv6_diff = pd .DataFrame ({
224
+ "geo_id" : ["2" , "3" , "6" ],
225
+ "val" : [np .nan , 3.0 , np .nan ],
226
+ "se" : [np .nan , 0.3 , np .nan ],
227
+ "sample_size" : [np .nan , 30.0 , np .nan ],
228
+ "missing_val" : [Nans .DELETED , Nans .NOT_MISSING , Nans .CENSORED ],
229
+ "missing_se" : [Nans .DELETED , Nans .NOT_MISSING , Nans .CENSORED ],
230
+ "missing_sample_size" : [Nans .DELETED , Nans .NOT_MISSING , Nans .CENSORED ],
231
+ })
232
+ expected_csv7_diff = pd .DataFrame ({
233
+ "geo_id" : ["2" , "3" ],
234
+ "val" : [np .nan , 3.0 ],
235
+ "se" : [np .nan , 0.3 ],
236
+ "sample_size" : [np .nan , 30.0 ]
205
237
})
206
238
207
239
arch_diff = ArchiveDiffer (cache_dir , export_dir )
@@ -225,7 +257,7 @@ def test_diff_and_filter_exports(self, tmp_path):
225
257
# Check return values
226
258
assert set (deleted_files ) == {join (cache_dir , "csv2.csv" )}
227
259
assert set (common_diffs .keys ()) == {
228
- join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" , "csv4.csv" , "csv5.csv" , "csv6.csv" , "csv7.csv" , "csv8.csv" ]}
260
+ join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" , "csv4.csv" , "csv5.csv" , "csv6.csv" , "csv7.csv" ]}
229
261
assert set (new_files ) == {join (export_dir , "csv3.csv" )}
230
262
assert common_diffs [join (export_dir , "csv0.csv" )] is None
231
263
assert common_diffs [join (export_dir , "csv1.csv" )] == join (
@@ -238,13 +270,34 @@ def test_diff_and_filter_exports(self, tmp_path):
238
270
"csv3.csv" ,
239
271
"csv4.csv" , "csv4.csv.diff" ,
240
272
"csv5.csv" , "csv5.csv.diff" ,
241
- "csv6.csv" ,
273
+ "csv6.csv" , "csv6.csv.diff" ,
242
274
"csv7.csv" , "csv7.csv.diff" ,
243
- "csv8.csv" , "csv8.csv.diff"
244
275
}
245
- assert_frame_equal (
276
+ _assert_frames_equal_ignore_row_order (
246
277
pd .read_csv (join (export_dir , "csv1.csv.diff" ), dtype = CSV_DTYPES ),
247
- csv1_diff )
278
+ expected_csv1_diff ,
279
+ index_cols = ["geo_id" ]
280
+ )
281
+ _assert_frames_equal_ignore_row_order (
282
+ pd .read_csv (join (export_dir , "csv4.csv.diff" ), dtype = CSV_DTYPES_OLD ),
283
+ expected_csv4_diff ,
284
+ index_cols = ["geo_id" ]
285
+ )
286
+ _assert_frames_equal_ignore_row_order (
287
+ pd .read_csv (join (export_dir , "csv5.csv.diff" ), dtype = CSV_DTYPES_OLD ),
288
+ expected_csv5_diff ,
289
+ index_cols = ["geo_id" ]
290
+ )
291
+ _assert_frames_equal_ignore_row_order (
292
+ pd .read_csv (join (export_dir , "csv6.csv.diff" ), dtype = CSV_DTYPES_OLD ),
293
+ expected_csv6_diff ,
294
+ index_cols = ["geo_id" ]
295
+ )
296
+ _assert_frames_equal_ignore_row_order (
297
+ pd .read_csv (join (export_dir , "csv7.csv.diff" ), dtype = CSV_DTYPES_OLD ),
298
+ expected_csv7_diff ,
299
+ index_cols = ["geo_id" ]
300
+ )
248
301
249
302
# Test filter_exports
250
303
# ===================
@@ -259,10 +312,37 @@ def test_diff_and_filter_exports(self, tmp_path):
259
312
arch_diff .filter_exports (common_diffs )
260
313
261
314
# Check exports directory just has incremental changes
262
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv7 .csv" , "csv8 .csv" }
263
- assert_frame_equal (
315
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv6 .csv" , "csv7 .csv" }
316
+ _assert_frames_equal_ignore_row_order (
264
317
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
265
- csv1_diff )
318
+ expected_csv1_diff ,
319
+ index_cols = ["geo_id" ]
320
+ )
321
+ _assert_frames_equal_ignore_row_order (
322
+ pd .read_csv (join (export_dir , "csv3.csv" ), dtype = CSV_DTYPES ),
323
+ expected_csv3 ,
324
+ index_cols = ["geo_id" ]
325
+ )
326
+ _assert_frames_equal_ignore_row_order (
327
+ pd .read_csv (join (export_dir , "csv4.csv" ), dtype = CSV_DTYPES ),
328
+ expected_csv4_diff ,
329
+ index_cols = ["geo_id" ]
330
+ )
331
+ _assert_frames_equal_ignore_row_order (
332
+ pd .read_csv (join (export_dir , "csv5.csv" ), dtype = CSV_DTYPES ),
333
+ expected_csv5_diff ,
334
+ index_cols = ["geo_id" ]
335
+ )
336
+ _assert_frames_equal_ignore_row_order (
337
+ pd .read_csv (join (export_dir , "csv6.csv" ), dtype = CSV_DTYPES ),
338
+ expected_csv6_diff ,
339
+ index_cols = ["geo_id" ]
340
+ )
341
+ _assert_frames_equal_ignore_row_order (
342
+ pd .read_csv (join (export_dir , "csv7.csv" ), dtype = CSV_DTYPES ),
343
+ expected_csv7_diff ,
344
+ index_cols = ["geo_id" ]
345
+ )
266
346
267
347
268
348
AWS_CREDENTIALS = {
@@ -384,7 +464,7 @@ def test_run(self, tmp_path, s3_client):
384
464
assert_frame_equal (pd .read_csv (body , dtype = CSV_DTYPES ), df )
385
465
386
466
# Check exports directory just has incremental changes
387
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv7 .csv" , "csv8 .csv" }
467
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv6 .csv" , "csv7 .csv" }
388
468
csv1_diff = pd .DataFrame ({
389
469
"geo_id" : ["3" , "2" , "4" ],
390
470
"val" : [np .nan , 2.1 , 4.0 ],
@@ -394,9 +474,11 @@ def test_run(self, tmp_path, s3_client):
394
474
"missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
395
475
"missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
396
476
})
397
- assert_frame_equal (
477
+ _assert_frames_equal_ignore_row_order (
398
478
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
399
- csv1_diff )
479
+ csv1_diff ,
480
+ index_cols = ["geo_id" ]
481
+ )
400
482
401
483
402
484
class TestGitArchiveDiffer :
@@ -596,7 +678,7 @@ def test_run(self, tmp_path):
596
678
original_branch .checkout ()
597
679
598
680
# Check exports directory just has incremental changes
599
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv7 .csv" , "csv8 .csv" }
681
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" , "csv6 .csv" , "csv7 .csv" }
600
682
csv1_diff = pd .DataFrame ({
601
683
"geo_id" : ["3" , "2" , "4" ],
602
684
"val" : [np .nan , 2.1 , 4.0 ],
@@ -606,9 +688,11 @@ def test_run(self, tmp_path):
606
688
"missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
607
689
"missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
608
690
})
609
- assert_frame_equal (
691
+ _assert_frames_equal_ignore_row_order (
610
692
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
611
- csv1_diff )
693
+ csv1_diff ,
694
+ index_cols = ["geo_id" ]
695
+ )
612
696
613
697
614
698
class TestFromParams :
0 commit comments