Skip to content

Commit 58e734c

Browse files
committed
NANs Safegraph Patterns:
* add the missing_* columns to the export dataframe * add basic detection of not-applicable and unknown-reason missingness
1 parent 2219278 commit 58e734c

File tree

2 files changed

+56
-11
lines changed

2 files changed

+56
-11
lines changed

safegraph_patterns/delphi_safegraph_patterns/process.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import pandas as pd
88

9-
from delphi_utils import create_export_csv, GeoMapper
9+
from delphi_utils import create_export_csv, GeoMapper, Nans
1010

1111
INCIDENCE_BASE = 100000
1212

@@ -124,6 +124,21 @@ def aggregate(df, metric, geo_res):
124124
* INCIDENCE_BASE
125125
return df.rename({geo_key: "geo_id"}, axis=1)
126126

127+
def add_nancodes(df):
    """Add missingness-code columns to the export dataframe.

    Adds the three ``missing_*`` columns expected by the export format:

    * ``missing_val``         -- why ``val`` is missing (``NOT_MISSING`` when present)
    * ``missing_se``          -- always ``NOT_APPLICABLE`` (``se`` is never reported here)
    * ``missing_sample_size`` -- always ``NOT_APPLICABLE`` (never reported here)

    Parameters
    ----------
    df : pd.DataFrame
        Export dataframe containing at least a ``val`` column. Mutated in
        place and also returned for convenient chaining.

    Returns
    -------
    pd.DataFrame
        The same dataframe with the three ``missing_*`` columns added.
    """
    # Values are assumed present unless found to be null below.
    df["missing_val"] = Nans.NOT_MISSING

    # se and sample_size are never reported by this indicator,
    # so their missingness reason is always "not applicable".
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # Any null value with no more specific cause gets the catch-all UNKNOWN
    # code. Compare against the enum member rather than the magic literal 0
    # so the check stays correct if the Nans codes are ever renumbered.
    remaining_nans_mask = (df["missing_val"] == Nans.NOT_MISSING) & df["val"].isnull()
    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN

    return df
141+
127142
def process(fname, sensors, metrics, geo_resolutions,
128143
export_dir, brand_df):
129144
"""
@@ -185,6 +200,7 @@ def process(fname, sensors, metrics, geo_resolutions,
185200
df_export["val"] = df_export["_".join([metric, sensor])]
186201
df_export["se"] = np.nan
187202
df_export["sample_size"] = np.nan
203+
df_export = add_nancodes(df_export)
188204

189205
if wip:
190206
metric = "wip_" + metric

safegraph_patterns/tests/test_run.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,34 @@ def test_output_files(self, clean_receiving_dir):
2929
csv_files = listdir("receiving")
3030

3131
dates = [
32-
"20190722", "20190723", "20190724", "20190725", "20190726",
33-
"20190727", "20190728", "20190729", "20190730", "20190731",
34-
"20190801", "20190802", "20190803", "20190804",
35-
"20200727", "20200728", "20200729", "20200730", "20200731",
36-
"20200801", "20200802", "20200803", "20200804", "20200805",
37-
"20200806", "20200807", "20200808", "20200809"
32+
"20190722",
33+
"20190723",
34+
"20190724",
35+
"20190725",
36+
"20190726",
37+
"20190727",
38+
"20190728",
39+
"20190729",
40+
"20190730",
41+
"20190731",
42+
"20190801",
43+
"20190802",
44+
"20190803",
45+
"20190804",
46+
"20200727",
47+
"20200728",
48+
"20200729",
49+
"20200730",
50+
"20200731",
51+
"20200801",
52+
"20200802",
53+
"20200803",
54+
"20200804",
55+
"20200805",
56+
"20200806",
57+
"20200807",
58+
"20200808",
59+
"20200809",
3860
]
3961

4062
expected_files = []
@@ -48,7 +70,14 @@ def test_output_files(self, clean_receiving_dir):
4870
assert set(expected_files).issubset(set(csv_files))
4971

5072
# Test output format
51-
df = pd.read_csv(
52-
join("./receiving", "20200729_state_bars_visit_num.csv")
53-
)
54-
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
73+
df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))
74+
expected_columns = [
75+
"geo_id",
76+
"val",
77+
"se",
78+
"sample_size",
79+
"missing_val",
80+
"missing_se",
81+
"missing_sample_size",
82+
]
83+
assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)