49
49
Files = List [str ]
50
50
FileDiffMap = Dict [str , Optional [str ]]
51
51
52
+ EXPORT_CSV_DTYPES = {
53
+ "geo_id" : str , "val" : float , "se" : float , "sample_size" : float ,
54
+ "missing_val" : "Int64" , "missing_se" : "Int64" , "missing_sample_size" : "Int64"
55
+ }
52
56
53
57
def diff_export_csv (
54
58
before_csv : str ,
@@ -75,15 +79,10 @@ def diff_export_csv(
75
79
changed_df is the pd.DataFrame of common rows from after_csv with changed values.
76
80
added_df is the pd.DataFrame of added rows from after_csv.
77
81
"""
78
- export_csv_dtypes = {
79
- "geo_id" : str , "val" : float , "se" : float , "sample_size" : float ,
80
- "missing_val" : int , "missing_se" : int , "missing_sample_size" : int
81
- }
82
-
83
- before_df = pd .read_csv (before_csv , dtype = export_csv_dtypes )
82
+ before_df = pd .read_csv (before_csv , dtype = EXPORT_CSV_DTYPES )
84
83
before_df .set_index ("geo_id" , inplace = True )
85
84
before_df = before_df .round ({"val" : 7 , "se" : 7 })
86
- after_df = pd .read_csv (after_csv , dtype = export_csv_dtypes )
85
+ after_df = pd .read_csv (after_csv , dtype = EXPORT_CSV_DTYPES )
87
86
after_df .set_index ("geo_id" , inplace = True )
88
87
after_df = after_df .round ({"val" : 7 , "se" : 7 })
89
88
deleted_idx = before_df .index .difference (after_df .index )
@@ -93,20 +92,21 @@ def diff_export_csv(
93
92
before_df_cmn = before_df .reindex (common_idx )
94
93
after_df_cmn = after_df .reindex (common_idx )
95
94
96
- # If CSVs have different columns (no missingness) , mark all values as new
97
- if ("missing_val" in before_df_cmn .columns ) ^ ("missing_val" in after_df_cmn .columns ):
95
+ # If new CSV has missingness columns, but old doesn't , mark all values as new
96
+ if ("missing_val" not in before_df_cmn .columns ) & ("missing_val" in after_df_cmn .columns ):
98
97
same_mask = after_df_cmn .copy ()
99
98
same_mask .loc [:] = False
100
99
else :
101
100
# Exact comparisons, treating NA == NA as True
102
101
same_mask = before_df_cmn == after_df_cmn
103
102
same_mask |= pd .isna (before_df_cmn ) & pd .isna (after_df_cmn )
104
103
105
- # Code deleted entries as nans with the deleted missing code
104
+ # Any deleted entries become rows with nans and the deleted missing code
106
105
deleted_df = before_df .loc [deleted_idx , :].copy ()
107
106
deleted_df [["val" , "se" , "sample_size" ]] = np .nan
108
- if "missing_val" in after_df_cmn .columns :
109
- deleted_df [["missing_val" , "missing_se" , "missing_sample_size" ]] = Nans .DELETED
107
+ # If the new file doesn't have missing columns, then when the deleted, changed, and added
108
+ # rows are concatenated (in diff_exports), they will default to NA
109
+ deleted_df [["missing_val" , "missing_se" , "missing_sample_size" ]] = Nans .DELETED
110
110
111
111
return (
112
112
deleted_df ,
0 commit comments