Skip to content

Commit 6b5263d

Browse files
authored
Merge pull request #1522 from cmu-delphi/dshem/fix-archiver
Fix archiver bug ignoring deletions when comparing two files with no missing columns
2 parents 019eb2c + 3e4df04 commit 6b5263d

File tree

2 files changed

+118
-119
lines changed

2 files changed

+118
-119
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,10 @@
4949
Files = List[str]
5050
FileDiffMap = Dict[str, Optional[str]]
5151

52+
EXPORT_CSV_DTYPES = {
53+
"geo_id": str, "val": float, "se": float, "sample_size": float,
54+
"missing_val": "Int64", "missing_se": "Int64", "missing_sample_size": "Int64"
55+
}
5256

5357
def diff_export_csv(
5458
before_csv: str,
@@ -75,15 +79,10 @@ def diff_export_csv(
7579
changed_df is the pd.DataFrame of common rows from after_csv with changed values.
7680
added_df is the pd.DataFrame of added rows from after_csv.
7781
"""
78-
export_csv_dtypes = {
79-
"geo_id": str, "val": float, "se": float, "sample_size": float,
80-
"missing_val": int, "missing_se": int, "missing_sample_size": int
81-
}
82-
83-
before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes)
82+
before_df = pd.read_csv(before_csv, dtype=EXPORT_CSV_DTYPES)
8483
before_df.set_index("geo_id", inplace=True)
8584
before_df = before_df.round({"val": 7, "se": 7})
86-
after_df = pd.read_csv(after_csv, dtype=export_csv_dtypes)
85+
after_df = pd.read_csv(after_csv, dtype=EXPORT_CSV_DTYPES)
8786
after_df.set_index("geo_id", inplace=True)
8887
after_df = after_df.round({"val": 7, "se": 7})
8988
deleted_idx = before_df.index.difference(after_df.index)
@@ -93,20 +92,21 @@ def diff_export_csv(
9392
before_df_cmn = before_df.reindex(common_idx)
9493
after_df_cmn = after_df.reindex(common_idx)
9594

96-
# If CSVs have different columns (no missingness), mark all values as new
97-
if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns):
95+
# If new CSV has missingness columns, but old doesn't, mark all values as new
96+
if ("missing_val" not in before_df_cmn.columns) & ("missing_val" in after_df_cmn.columns):
9897
same_mask = after_df_cmn.copy()
9998
same_mask.loc[:] = False
10099
else:
101100
# Exact comparisons, treating NA == NA as True
102101
same_mask = before_df_cmn == after_df_cmn
103102
same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
104103

105-
# Code deleted entries as nans with the deleted missing code
104+
# Any deleted entries become rows with nans and the deleted missing code
106105
deleted_df = before_df.loc[deleted_idx, :].copy()
107106
deleted_df[["val", "se", "sample_size"]] = np.nan
108-
if "missing_val" in after_df_cmn.columns:
109-
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
107+
# If the new file doesn't have missing columns, then when the deleted, changed, and added
108+
# rows are concatenated (in diff_exports), they will default to NA
109+
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
110110

111111
return (
112112
deleted_df,

0 commit comments

Comments
 (0)