Skip to content

Commit 2ea630b

Browse files
committed
Update utilities for NAN codes:
* update export utility to export, validate, and test the missing cols * add deletion coding to the archiver, make it expect missing cols, and let it handle comparisons between missing and non-missing CSVs
1 parent badf367 commit 2ea630b

File tree

4 files changed

+45
-49
lines changed

4 files changed

+45
-49
lines changed

_delphi_utils_python/delphi_utils/archive.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def diff_export_csv(
7777
"""
7878
export_csv_dtypes = {
7979
"geo_id": str, "val": float, "se": float, "sample_size": float,
80-
"missing_val": int, "missing_se":int, "missing_sample_size": int
80+
"missing_val": int, "missing_se": int, "missing_sample_size": int
8181
}
8282

8383
before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes)
@@ -93,9 +93,14 @@ def diff_export_csv(
9393
before_df_cmn = before_df.reindex(common_idx)
9494
after_df_cmn = after_df.reindex(common_idx)
9595

96-
# Exact comparisons, treating NA == NA as True
97-
same_mask = before_df_cmn == after_df_cmn
98-
same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
96+
# If only one of the two CSVs has the missingness columns, mark all values as new
97+
if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns):
98+
same_mask = after_df_cmn.copy()
99+
same_mask.loc[:] = False
100+
else:
101+
# Exact comparisons, treating NA == NA as True
102+
same_mask = before_df_cmn == after_df_cmn
103+
same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
99104

100105
# Code deleted entries as nans with the deleted missing code
101106
deleted_df = before_df.loc[deleted_idx, :].copy()

_delphi_utils_python/delphi_utils/export.py

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,42 +12,21 @@
1212

1313
def filter_contradicting_missing_codes(df, sensor, metric, date, logger=None):
1414
"""Find values with contradictory missingness codes, filter them, and log."""
15-
val_contradictory_missing_mask = (
16-
(df["val"].isna() & df["missing_val"].eq(Nans.NOT_MISSING))
17-
|
18-
(df["val"].notna() & df["missing_val"].ne(Nans.NOT_MISSING))
19-
)
20-
se_contradictory_missing_mask = (
21-
(df["se"].isna() & df["missing_se"].eq(Nans.NOT_MISSING))
22-
|
23-
(df["se"].notna() & df["missing_se"].ne(Nans.NOT_MISSING))
24-
)
25-
sample_size_contradictory_missing_mask = (
26-
(df["sample_size"].isna() & df["missing_sample_size"].eq(Nans.NOT_MISSING))
27-
|
28-
(df["sample_size"].notna() & df["missing_sample_size"].ne(Nans.NOT_MISSING))
29-
)
30-
if df.loc[val_contradictory_missing_mask].size > 0:
31-
if not logger is None:
15+
columns = ["val", "se", "sample_size"]
16+
# Get indices where the XNOR is true (i.e. both are true or both are false).
17+
masks = [
18+
~(df[column].isna() ^ df["missing_" + column].eq(Nans.NOT_MISSING))
19+
for column in columns
20+
]
21+
for mask in masks:
22+
if not logger is None and df.loc[mask].size > 0:
3223
logger.info(
3324
"Filtering contradictory missing code in " +
3425
"{0}_{1}_{2}.".format(sensor, metric, date.strftime(format="%Y-%m-%d"))
3526
)
36-
df = df.loc[~val_contradictory_missing_mask]
37-
if df.loc[se_contradictory_missing_mask].size > 0:
38-
if not logger is None:
39-
logger.info(
40-
"Filtering contradictory missing code in " +
41-
"{0}_{1}_{2}.".format(sensor, metric, date.strftime(format="%Y-%m-%d"))
42-
)
43-
df = df.loc[~se_contradictory_missing_mask]
44-
if df.loc[sample_size_contradictory_missing_mask].size > 0:
45-
if not logger is None:
46-
logger.info(
47-
"Filtering contradictory missing code in " +
48-
"{0}_{1}_{2}.".format(sensor, metric, date.strftime(format="%Y-%m-%d"))
49-
)
50-
df = df.loc[~se_contradictory_missing_mask]
27+
df = df.loc[~mask]
28+
elif logger is None and df.loc[mask].size > 0:
29+
df = df.loc[~mask]
5130
return df
5231

5332
def create_export_csv(

_delphi_utils_python/delphi_utils/nancodes.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
"""Provides unified not-a-number codes for the indicators.
2-
3-
Currently requires a manual sync between the covidcast-indicators
4-
and the delphi-epidata repo.
5-
* in covidcast-indicators: _delphi_utils_python/delphi_utils
6-
* in delphi-epidata: src/acquisition/covidcast
7-
"""
1+
"""Unified not-a-number codes for CMU Delphi codebase."""
82

93
from enum import IntEnum
104

_delphi_utils_python/tests/test_archive.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@
5252
"missing_se": [Nans.NOT_MISSING],
5353
"missing_sample_size": [Nans.NOT_MISSING],
5454
}),
55+
56+
# Common, but updated with missing columns
57+
"csv4": pd.DataFrame({
58+
"geo_id": ["1"],
59+
"val": [1.0],
60+
"se": [0.1],
61+
"sample_size": [10.0]
62+
}),
5563
}
5664

5765
CSVS_AFTER = {
@@ -86,8 +94,18 @@
8694
"missing_se": [Nans.NOT_MISSING],
8795
"missing_sample_size": [Nans.NOT_MISSING],
8896
}),
89-
}
9097

98+
# Common, but updated with missing columns
99+
"csv4": pd.DataFrame({
100+
"geo_id": ["1"],
101+
"val": [1.0],
102+
"se": [0.1],
103+
"sample_size": [10.0],
104+
"missing_val": [Nans.NOT_MISSING],
105+
"missing_se": [Nans.NOT_MISSING],
106+
"missing_sample_size": [Nans.NOT_MISSING],
107+
}),
108+
}
91109

92110
class TestArchiveDiffer:
93111

@@ -137,15 +155,15 @@ def test_diff_and_filter_exports(self, tmp_path):
137155
# Check return values
138156
assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
139157
assert set(common_diffs.keys()) == {
140-
join(export_dir, f) for f in ["csv0.csv", "csv1.csv"]}
158+
join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv"]}
141159
assert set(new_files) == {join(export_dir, "csv3.csv")}
142160
assert common_diffs[join(export_dir, "csv0.csv")] is None
143161
assert common_diffs[join(export_dir, "csv1.csv")] == join(
144162
export_dir, "csv1.csv.diff")
145163

146164
# Check filesystem for actual files
147165
assert set(listdir(export_dir)) == {
148-
"csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv"}
166+
"csv0.csv", "csv1.csv", "csv1.csv.diff", "csv3.csv", "csv4.csv", "csv4.csv.diff"}
149167
assert_frame_equal(
150168
pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
151169
csv1_diff)
@@ -163,7 +181,7 @@ def test_diff_and_filter_exports(self, tmp_path):
163181
arch_diff.filter_exports(common_diffs)
164182

165183
# Check exports directory just has incremental changes
166-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
184+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
167185
assert_frame_equal(
168186
pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
169187
csv1_diff)
@@ -290,7 +308,7 @@ def test_run(self, tmp_path, s3_client):
290308
assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), df)
291309

292310
# Check exports directory just has incremental changes
293-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
311+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
294312
csv1_diff = pd.DataFrame({
295313
"geo_id": ["3", "2", "4"],
296314
"val": [np.nan, 2.1, 4.0],
@@ -503,7 +521,7 @@ def test_run(self, tmp_path):
503521
original_branch.checkout()
504522

505523
# Check exports directory just has incremental changes
506-
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv"}
524+
assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv"}
507525
csv1_diff = pd.DataFrame({
508526
"geo_id": ["3", "2", "4"],
509527
"val": [np.nan, 2.1, 4.0],

0 commit comments

Comments
 (0)