Skip to content

Commit 3e4df04

Browse files
committed
Code deletions properly, remove redundant tests
1 parent 54ef6d4 commit 3e4df04

File tree

2 files changed

+72
-168
lines changed

2 files changed

+72
-168
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 12 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,10 @@
4949
Files = List[str]
5050
FileDiffMap = Dict[str, Optional[str]]
5151

52+
EXPORT_CSV_DTYPES = {
53+
"geo_id": str, "val": float, "se": float, "sample_size": float,
54+
"missing_val": "Int64", "missing_se": "Int64", "missing_sample_size": "Int64"
55+
}
5256

5357
def diff_export_csv(
5458
before_csv: str,
@@ -75,15 +79,10 @@ def diff_export_csv(
7579
changed_df is the pd.DataFrame of common rows from after_csv with changed values.
7680
added_df is the pd.DataFrame of added rows from after_csv.
7781
"""
78-
export_csv_dtypes = {
79-
"geo_id": str, "val": float, "se": float, "sample_size": float,
80-
"missing_val": int, "missing_se": int, "missing_sample_size": int
81-
}
82-
83-
before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes)
82+
before_df = pd.read_csv(before_csv, dtype=EXPORT_CSV_DTYPES)
8483
before_df.set_index("geo_id", inplace=True)
8584
before_df = before_df.round({"val": 7, "se": 7})
86-
after_df = pd.read_csv(after_csv, dtype=export_csv_dtypes)
85+
after_df = pd.read_csv(after_csv, dtype=EXPORT_CSV_DTYPES)
8786
after_df.set_index("geo_id", inplace=True)
8887
after_df = after_df.round({"val": 7, "se": 7})
8988
deleted_idx = before_df.index.difference(after_df.index)
@@ -93,31 +92,21 @@ def diff_export_csv(
9392
before_df_cmn = before_df.reindex(common_idx)
9493
after_df_cmn = after_df.reindex(common_idx)
9594

96-
# If CSVs have different columns (no missingness), mark all values as new
97-
if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns):
95+
# If new CSV has missingness columns, but old doesn't, mark all values as new
96+
if ("missing_val" not in before_df_cmn.columns) & ("missing_val" in after_df_cmn.columns):
9897
same_mask = after_df_cmn.copy()
9998
same_mask.loc[:] = False
10099
else:
101100
# Exact comparisons, treating NA == NA as True
102101
same_mask = before_df_cmn == after_df_cmn
103102
same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
104103

105-
# Code deleted entries as nans with the deleted missing code
104+
# Any deleted entries become rows with nans and the deleted missing code
106105
deleted_df = before_df.loc[deleted_idx, :].copy()
107106
deleted_df[["val", "se", "sample_size"]] = np.nan
108-
if "missing_val" in after_df_cmn.columns:
109-
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
110-
111-
# Remove deleted entries that were already present
112-
if deleted_idx.size > 0:
113-
deleted_same_mask = deleted_df == before_df.loc[deleted_idx, :]
114-
deleted_same_mask |= pd.isna(deleted_df) & pd.isna(before_df.loc[deleted_idx, :])
115-
deleted_df = deleted_df.loc[~(deleted_same_mask.all(axis=1)), :]
116-
117-
# If the new file has no missing columns, then we should remove them from
118-
# the deletions too
119-
if "missing_val" not in after_df_cmn.columns:
120-
deleted_df = deleted_df[["val", "se", "sample_size"]]
107+
# If the new file doesn't have missing columns, then when the deleted, changed, and added
108+
# rows are concatenated (in diff_exports), they will default to NA
109+
deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
121110

122111
return (
123112
deleted_df,

0 commit comments

Comments
 (0)