Skip to content

Commit 7248d69

Browse files
committed
Fix archiver bug treating old deletions as new deletions, add many tests
1 parent fb3d70a commit 7248d69

File tree

2 files changed

+160
-65
lines changed

2 files changed

+160
-65
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,17 @@ def diff_export_csv(
107107
deleted_df[["val", "se", "sample_size"]] = np.nan
108108
if "missing_val" in after_df_cmn.columns:
109109
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
110+
111+
# Remove deleted entries that were already present
112+
if deleted_idx.size > 0:
113+
deleted_same_mask = deleted_df == before_df.loc[deleted_idx, :]
114+
deleted_same_mask |= pd.isna(deleted_df) & pd.isna(before_df.loc[deleted_idx, :])
115+
deleted_df = deleted_df.loc[~(deleted_same_mask.all(axis=1)), :]
116+
117+
# If the new file has no missing columns, then we should remove them from
118+
# the deletions too
119+
if "missing_val" not in after_df_cmn.columns:
120+
deleted_df = deleted_df[["val", "se", "sample_size"]]
110121

111122
return (
112123
deleted_df,

_delphi_utils_python/tests/test_archive.py

Lines changed: 149 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from io import StringIO, BytesIO
33
from os import listdir, mkdir
44
from os.path import join
5+
from typing import List
56

67
from boto3 import Session
78
from git import Repo, exc
@@ -19,7 +20,10 @@
1920
CSV_DTYPES = {
2021
"geo_id": str, "val": float, "se": float, "sample_size": float,
2122
"missing_val": int, "missing_se": int, "missing_sample_size": int
22-
}
23+
}
24+
CSV_DTYPES_OLD = {
25+
"geo_id": str, "val": float, "se": float, "sample_size": float
26+
}
2327

2428
CSVS_BEFORE = {
2529
# All rows unchanged
@@ -74,29 +78,29 @@
7478
"missing_sample_size": [Nans.NOT_MISSING],
7579
}),
7680

77-
# All rows common, but no missing columns
81+
# Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
82+
# and present again as nan, row 5 deleted previously and absent from file,
83+
# row 6 deleted previously, but new missing has different NA code
84+
# (missing columns present)
7885
"csv6": pd.DataFrame({
79-
"geo_id": ["1"],
80-
"val": [1.0],
81-
"se": [0.1],
82-
"sample_size": [10.0]
83-
}),
84-
85-
# Row deleted and row added, but no missing columns (will not be uploaded)
86+
"geo_id": ["1", "2", "4", "5", "6"],
87+
"val": [1.0, 2.0, np.nan, np.nan, np.nan],
88+
"se": [0.1, 0.2, np.nan, np.nan, np.nan],
89+
"sample_size": [10.0, 20.0, np.nan, np.nan, np.nan],
90+
"missing_val": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] * 3,
91+
"missing_se": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] * 3,
92+
"missing_sample_size": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] * 3,
93+
}),
94+
95+
# Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
96+
# and present again as nan, row 5 deleted previously and absent from file
97+
# (no missing columns)
8698
"csv7": pd.DataFrame({
87-
"geo_id": ["1", "2"],
88-
"val": [1.0, 2.0],
89-
"se": [0.1, 0.2],
90-
"sample_size": [10.0, 20.0]
91-
}),
92-
93-
# Row deleted and row added, but no missing columns
94-
"csv8": pd.DataFrame({
95-
"geo_id": ["1", "2"],
96-
"val": [1.0, 2.0],
97-
"se": [0.1, 0.2],
98-
"sample_size": [10.0, 20.0]
99-
}),
99+
"geo_id": ["1", "2", "4", "5"],
100+
"val": [1.0, 2.0, np.nan, np.nan],
101+
"se": [0.1, 0.2, np.nan, np.nan],
102+
"sample_size": [10.0, 20.0, np.nan, np.nan]
103+
})
100104
}
101105

102106
CSVS_AFTER = {
@@ -152,31 +156,41 @@
152156
"sample_size": [10.0]
153157
}),
154158

155-
# All rows common, but no missing columns
159+
# Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
160+
# and present again as nan, row 5 deleted previously and absent from file,
161+
# row 6 deleted previously, but new missing has different NA code
162+
# (missing columns present)
156163
"csv6": pd.DataFrame({
157-
"geo_id": ["1"],
158-
"val": [1.0],
159-
"se": [0.1],
160-
"sample_size": [10.0]
161-
}),
162-
163-
# Row deleted and row added, but no missing columns (will not be uploaded)
164+
"geo_id": ["1", "3", "4", "6"],
165+
"val": [1.0, 3.0, np.nan, np.nan],
166+
"se": [0.1, 0.3, np.nan, np.nan],
167+
"sample_size": [10.0, 30.0, np.nan, np.nan],
168+
"missing_val": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] + [Nans.CENSORED],
169+
"missing_se": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] + [Nans.CENSORED],
170+
"missing_sample_size": [Nans.NOT_MISSING] * 2 + [Nans.DELETED] + [Nans.CENSORED],
171+
}),
172+
173+
# Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
174+
# and present again as nan, row 5 deleted previously and absent from file
175+
# (no missing columns)
164176
"csv7": pd.DataFrame({
165-
"geo_id": ["1"],
166-
"val": [1.0],
167-
"se": [0.1],
168-
"sample_size": [10.0]
169-
}),
170-
171-
# Row deleted and row added, but no missing columns
172-
"csv8": pd.DataFrame({
173-
"geo_id": ["1", "3"],
174-
"val": [1.0, 3.0],
175-
"se": [0.1, 0.3],
176-
"sample_size": [10.0, 30.0]
177-
}),
177+
"geo_id": ["1", "3", "4"],
178+
"val": [1.0, 3.0, np.nan],
179+
"se": [0.1, 0.3, np.nan],
180+
"sample_size": [10.0, 30.0, np.nan]
181+
})
178182
}
179183

184+
import shutil
185+
try:
186+
shutil.rmtree("/tmp/cache")
187+
shutil.rmtree("/tmp/export")
188+
except:
189+
...
190+
191+
def _assert_frames_equal_ignore_row_order(df1, df2, index_cols: List[str] =None):
192+
return assert_frame_equal(df1.set_index(index_cols).sort_index(), df2.set_index(index_cols).sort_index())
193+
180194
class TestArchiveDiffer:
181195

182196
def test_stubs(self):
@@ -194,14 +208,32 @@ def test_diff_and_filter_exports(self, tmp_path):
194208
mkdir(cache_dir)
195209
mkdir(export_dir)
196210

197-
csv1_diff = pd.DataFrame({
198-
"geo_id": ["3", "2", "4"],
199-
"val": [np.nan, 2.1, 4.0],
200-
"se": [np.nan, 0.21, np.nan],
201-
"sample_size": [np.nan, 21.0, 40.0],
202-
"missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
203-
"missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
204-
"missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
211+
expected_csv1_diff = pd.DataFrame({
212+
"geo_id": ["2", "3", "4"],
213+
"val": [2.1, np.nan, 4.0],
214+
"se": [0.21, np.nan, np.nan],
215+
"sample_size": [21.0, np.nan, 40.0],
216+
"missing_val": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
217+
"missing_se": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
218+
"missing_sample_size": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
219+
})
220+
expected_csv3 = CSVS_AFTER["csv3"]
221+
expected_csv4_diff = CSVS_AFTER["csv4"]
222+
expected_csv5_diff = CSVS_AFTER["csv5"]
223+
expected_csv6_diff = pd.DataFrame({
224+
"geo_id": ["2", "3", "6"],
225+
"val": [np.nan, 3.0, np.nan],
226+
"se": [np.nan, 0.3, np.nan],
227+
"sample_size": [np.nan, 30.0, np.nan],
228+
"missing_val": [Nans.DELETED, Nans.NOT_MISSING, Nans.CENSORED],
229+
"missing_se": [Nans.DELETED, Nans.NOT_MISSING, Nans.CENSORED],
230+
"missing_sample_size": [Nans.DELETED, Nans.NOT_MISSING, Nans.CENSORED],
231+
})
232+
expected_csv7_diff = pd.DataFrame({
233+
"geo_id": ["2", "3"],
234+
"val": [np.nan, 3.0],
235+
"se": [np.nan, 0.3],
236+
"sample_size": [np.nan, 30.0]
205237
})
206238

207239
arch_diff = ArchiveDiffer(cache_dir, export_dir)
@@ -225,7 +257,7 @@ def test_diff_and_filter_exports(self, tmp_path):
225257
# Check return values
226258
assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
227259
assert set(common_diffs.keys()) == {
228-
join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv", "csv8.csv"]}
260+
join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv"]}
229261
assert set(new_files) == {join(export_dir, "csv3.csv")}
230262
assert common_diffs[join(export_dir, "csv0.csv")] is None
231263
assert common_diffs[join(export_dir, "csv1.csv")] == join(
@@ -238,13 +270,34 @@ def test_diff_and_filter_exports(self, tmp_path):
238270
"csv3.csv",
239271
"csv4.csv", "csv4.csv.diff",
240272
"csv5.csv", "csv5.csv.diff",
241-
"csv6.csv",
273+
"csv6.csv", "csv6.csv.diff",
242274
"csv7.csv", "csv7.csv.diff",
243-
"csv8.csv", "csv8.csv.diff"
244275
}
245-
assert_frame_equal(
276+
_assert_frames_equal_ignore_row_order(
246277
pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
247-
csv1_diff)
278+
expected_csv1_diff,
279+
index_cols=["geo_id"]
280+
)
281+
_assert_frames_equal_ignore_row_order(
282+
pd.read_csv(join(export_dir, "csv4.csv.diff"), dtype=CSV_DTYPES_OLD),
283+
expected_csv4_diff,
284+
index_cols=["geo_id"]
285+
)
286+
_assert_frames_equal_ignore_row_order(
287+
pd.read_csv(join(export_dir, "csv5.csv.diff"), dtype=CSV_DTYPES_OLD),
288+
expected_csv5_diff,
289+
index_cols=["geo_id"]
290+
)
291+
_assert_frames_equal_ignore_row_order(
292+
pd.read_csv(join(export_dir, "csv6.csv.diff"), dtype=CSV_DTYPES_OLD),
293+
expected_csv6_diff,
294+
index_cols=["geo_id"]
295+
)
296+
_assert_frames_equal_ignore_row_order(
297+
pd.read_csv(join(export_dir, "csv7.csv.diff"), dtype=CSV_DTYPES_OLD),
298+
expected_csv7_diff,
299+
index_cols=["geo_id"]
300+
)
248301

249302
# Test filter_exports
250303
# ===================
@@ -259,10 +312,37 @@ def test_diff_and_filter_exports(self, tmp_path):
259312
arch_diff.filter_exports(common_diffs)
260313

261314
# Check exports directory just has incremental changes
262-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
263-
assert_frame_equal(
315+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv"}
316+
_assert_frames_equal_ignore_row_order(
264317
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
265-
csv1_diff)
318+
expected_csv1_diff,
319+
index_cols=["geo_id"]
320+
)
321+
_assert_frames_equal_ignore_row_order(
322+
pd.read_csv(join(export_dir, "csv3.csv"), dtype=CSV_DTYPES),
323+
expected_csv3,
324+
index_cols=["geo_id"]
325+
)
326+
_assert_frames_equal_ignore_row_order(
327+
pd.read_csv(join(export_dir, "csv4.csv"), dtype=CSV_DTYPES),
328+
expected_csv4_diff,
329+
index_cols=["geo_id"]
330+
)
331+
_assert_frames_equal_ignore_row_order(
332+
pd.read_csv(join(export_dir, "csv5.csv"), dtype=CSV_DTYPES),
333+
expected_csv5_diff,
334+
index_cols=["geo_id"]
335+
)
336+
_assert_frames_equal_ignore_row_order(
337+
pd.read_csv(join(export_dir, "csv6.csv"), dtype=CSV_DTYPES),
338+
expected_csv6_diff,
339+
index_cols=["geo_id"]
340+
)
341+
_assert_frames_equal_ignore_row_order(
342+
pd.read_csv(join(export_dir, "csv7.csv"), dtype=CSV_DTYPES),
343+
expected_csv7_diff,
344+
index_cols=["geo_id"]
345+
)
266346

267347

268348
AWS_CREDENTIALS = {
@@ -384,7 +464,7 @@ def test_run(self, tmp_path, s3_client):
384464
assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)
385465

386466
# Check exports directory just has incremental changes
387-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
467+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv"}
388468
csv1_diff = pd.DataFrame({
389469
"geo_id": ["3", "2", "4"],
390470
"val": [np.nan, 2.1, 4.0],
@@ -394,9 +474,11 @@ def test_run(self, tmp_path, s3_client):
394474
"missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
395475
"missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
396476
})
397-
assert_frame_equal(
477+
_assert_frames_equal_ignore_row_order(
398478
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
399-
csv1_diff)
479+
csv1_diff,
480+
index_cols=["geo_id"]
481+
)
400482

401483

402484
class TestGitArchiveDiffer:
@@ -596,7 +678,7 @@ def test_run(self, tmp_path):
596678
original_branch.checkout()
597679

598680
# Check exports directory just has incremental changes
599-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv7.csv", "csv8.csv"}
681+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv", "csv6.csv", "csv7.csv"}
600682
csv1_diff = pd.DataFrame({
601683
"geo_id": ["3", "2", "4"],
602684
"val": [np.nan, 2.1, 4.0],
@@ -606,9 +688,11 @@ def test_run(self, tmp_path):
606688
"missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
607689
"missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
608690
})
609-
assert_frame_equal(
691+
_assert_frames_equal_ignore_row_order(
610692
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
611-
csv1_diff)
693+
csv1_diff,
694+
index_cols=["geo_id"]
695+
)
612696

613697

614698
class TestFromParams:

0 commit comments

Comments
 (0)