Skip to content

Commit e6ed963

Browse files
committed
Merge branch 'nancodes' into nans_cdc_covidnet
2 parents cf522d4 + de3abd6 commit e6ed963

File tree

2 files changed

+70
-10
lines changed

2 files changed

+70
-10
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,15 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
254254
new_issues_df.to_csv(diff_file, na_rep="NA")
255255
common_diffs[after_file] = diff_file
256256

257+
# Replace deleted files with empty versions, but only if the cached version is not
258+
# already empty
259+
for deleted_file in deleted_files:
260+
deleted_df = pd.read_csv(deleted_file)
261+
if not deleted_df.empty:
262+
empty_df = deleted_df[0:0]
263+
new_deleted_filename = join(self.export_dir, basename(deleted_file))
264+
empty_df.to_csv(new_deleted_filename, index=False)
265+
257266
return deleted_files, common_diffs, new_files
258267

259268
def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]:
@@ -280,9 +289,10 @@ def filter_exports(self, common_diffs: FileDiffMap):
280289
Filter export directory to only contain relevant files.
281290
282291
Filters down the export_dir to only contain:
283-
1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only.
284-
Should be called after archive_exports() so we archive the raw exports before
285-
potentially modifying them.
292+
1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows
293+
only, and 3) Deleted files replaced with empty CSVs with the same name. Should
294+
be called after archive_exports() so we archive the raw exports before potentially
295+
modifying them.
286296
287297
Parameters
288298
----------
@@ -311,9 +321,9 @@ def run(self):
311321
self.update_cache()
312322

313323
# Diff exports, and make incremental versions
314-
_, common_diffs, new_files = self.diff_exports()
324+
deleted_files, common_diffs, new_files = self.diff_exports()
315325

316-
# Archive changed and new files only
326+
# Archive changed, new, and emptied deleted files
317327
to_archive = [f for f, diff in common_diffs.items()
318328
if diff is not None]
319329
to_archive += new_files

_delphi_utils_python/tests/test_archive.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,19 @@ def test_diff_and_filter_exports(self, tmp_path):
134134
"missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
135135
})
136136

137+
csv2_deleted = pd.DataFrame(
138+
np.empty(0, dtype=[
139+
("geo_id", str),
140+
("val", float),
141+
("se", float),
142+
("sample_size", float),
143+
("missing_val", int),
144+
("missing_se", int),
145+
("missing_sample_size", int)
146+
]),
147+
index=[]
148+
)
149+
137150
arch_diff = ArchiveDiffer(cache_dir, export_dir)
138151

139152
# Test diff_exports
@@ -163,7 +176,10 @@ def test_diff_and_filter_exports(self, tmp_path):
163176

164177
# Check filesystem for actual files
165178
assert set(listdir(export_dir)) == {
166-
"csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv", "csv4.csv", "csv4.csv.diff"}
179+
"csv0.csv", "csv1.csv", "csv1.csv.diff",
180+
"csv3.csv", "csv4.csv", "csv4.csv.diff",
181+
"csv2.csv"
182+
}
167183
assert_frame_equal(
168184
pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
169185
csv1_diff)
@@ -180,8 +196,11 @@ def test_diff_and_filter_exports(self, tmp_path):
180196

181197
arch_diff.filter_exports(common_diffs)
182198

183-
# Check exports directory just has incremental changes
184-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
199+
# Check exports directory just has incremental and deleted changes
200+
assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"}
201+
assert_frame_equal(
202+
pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES),
203+
csv2_deleted)
185204
assert_frame_equal(
186205
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
187206
csv1_diff)
@@ -308,7 +327,7 @@ def test_run(self, tmp_path, s3_client):
308327
assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)
309328

310329
# Check exports directory has only incremental changes and emptied deleted files
311-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
330+
assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"}
312331
csv1_diff = pd.DataFrame({
313332
"geo_id": ["3", "2", "4"],
314333
"val": [np.nan, 2.1, 4.0],
@@ -321,6 +340,21 @@ def test_run(self, tmp_path, s3_client):
321340
assert_frame_equal(
322341
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
323342
csv1_diff)
343+
csv2_deleted = pd.DataFrame(
344+
np.empty(0, dtype=[
345+
("geo_id", str),
346+
("val", float),
347+
("se", float),
348+
("sample_size", float),
349+
("missing_val", int),
350+
("missing_se", int),
351+
("missing_sample_size", int)
352+
]),
353+
index=[]
354+
)
355+
assert_frame_equal(
356+
pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES),
357+
csv2_deleted)
324358

325359

326360
class TestGitArchiveDiffer:
@@ -521,7 +555,7 @@ def test_run(self, tmp_path):
521555
original_branch.checkout()
522556

523557
# Check exports directory has only incremental changes and emptied deleted files
524-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
558+
assert set(listdir(export_dir)) == {"csv1.csv", "csv2.csv", "csv3.csv", "csv4.csv"}
525559
csv1_diff = pd.DataFrame({
526560
"geo_id": ["3", "2", "4"],
527561
"val": [np.nan, 2.1, 4.0],
@@ -534,6 +568,22 @@ def test_run(self, tmp_path):
534568
assert_frame_equal(
535569
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
536570
csv1_diff)
571+
csv2_deleted = pd.DataFrame(
572+
np.empty(0, dtype=[
573+
("geo_id", str),
574+
("val", float),
575+
("se", float),
576+
("sample_size", float),
577+
("missing_val", int),
578+
("missing_se", int),
579+
("missing_sample_size", int)
580+
]),
581+
index=[]
582+
)
583+
assert_frame_equal(
584+
pd.read_csv(join(export_dir, "csv2.csv"), dtype=CSV_DTYPES),
585+
csv2_deleted)
586+
537587

538588

539589
class TestFromParams:

0 commit comments

Comments
 (0)