
Commit 1df3553

NANs Safegraph Patterns:
* add missing columns to the export dataframe
* basic support and unknown-reason missingness detection

1 parent f45cb9c commit 1df3553

3 files changed: 64 additions, 22 deletions


safegraph_patterns/delphi_safegraph_patterns/process.py

Lines changed: 17 additions & 1 deletion
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 
-from delphi_utils import create_export_csv, GeoMapper
+from delphi_utils import create_export_csv, GeoMapper, NAN_CODES
 
 INCIDENCE_BASE = 100000
 
@@ -124,6 +124,21 @@ def aggregate(df, metric, geo_res):
         * INCIDENCE_BASE
     return df.rename({geo_key: "geo_id"}, axis=1)
 
+def add_nancodes(df):
+    """Add nan codes to the df."""
+    # Values are not missing unless found to be null
+    df["missing_val"] = NAN_CODES["Not Missing"]
+
+    # These values aren't reported, so are always missing
+    df["missing_se"] = NAN_CODES["Not Applicable"]
+    df["missing_sample_size"] = NAN_CODES["Not Applicable"]
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = (df["missing_val"] == 0) & df["val"].isnull()
+    df.loc[remaining_nans_mask, "missing_val"] = NAN_CODES["Unknown"]
+
+    return df
+
 def process(fname, sensors, metrics, geo_resolutions,
             export_dir, brand_df):
     """
@@ -185,6 +200,7 @@ def process(fname, sensors, metrics, geo_resolutions,
             df_export["val"] = df_export["_".join([metric, sensor])]
             df_export["se"] = np.nan
             df_export["sample_size"] = np.nan
+            df_export = add_nancodes(df_export)
 
             if wip:
                 metric = "wip_" + metric

safegraph_patterns/tests/test_process.py

Lines changed: 6 additions & 8 deletions
@@ -14,7 +14,7 @@
 metric_names, naics_codes, _ = (list(x) for x in zip(*METRICS))
 
 brand_df = pd.read_csv(
-    join("./static", f"brand_info/brand_info_202004.csv")
+    join("./static", "brand_info/brand_info_202004.csv")
 )
 
 class TestProcess:
@@ -23,31 +23,29 @@ def test_construct_signals_present(self):
         df = pd.read_csv('test_data/sample_raw_data.csv',
                          parse_dates=["date_range_start", "date_range_end"])
         dfs = construct_signals(df, metric_names, naics_codes, brand_df)
-        assert set(["timestamp", "zip", 
+        assert set(["timestamp", "zip",
                     "bars_visit_num"]) == set(dfs["bars_visit"].columns)
         assert set(["timestamp", "zip", "restaurants_visit_num"]) == \
             set(dfs["restaurants_visit"].columns)
         assert dfs["bars_visit"]["timestamp"].unique().shape[0] == 7
         assert dfs["restaurants_visit"]["timestamp"].unique().shape[0] == 7
 
     def test_aggregate_county(self):
-        
+
         df = pd.read_csv('test_data/sample_filtered_data.csv', parse_dates=["timestamp"])
         df_export = aggregate(df, "bars_visit", "county")
 
         assert np.all(df_export["bars_visit_num"].values >= 0)
         assert np.all(df_export["bars_visit_prop"].dropna().values <= INCIDENCE_BASE)
-        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop", 
+        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop",
                     "population"]) == set(df_export.columns)
 
     def test_aggregate_state(self):
-        
+
         df = pd.read_csv('test_data/sample_filtered_data.csv', parse_dates=["timestamp"])
         df_export = aggregate(df, "bars_visit", "state")
 
         assert np.all(df_export["bars_visit_num"].values >= 0)
         assert np.all(df_export["bars_visit_prop"].dropna().values <= INCIDENCE_BASE)
-        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop", 
+        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop",
                     "population"]) == set(df_export.columns)
-
-

safegraph_patterns/tests/test_run.py

Lines changed: 41 additions & 13 deletions
@@ -3,9 +3,8 @@
 
 import pandas as pd
 
-from delphi_safegraph_patterns.run import (run_module, METRICS,
-                                           SENSORS, GEO_RESOLUTIONS)
-
+from delphi_safegraph_patterns.run import run_module, METRICS, SENSORS, GEO_RESOLUTIONS
+
 
 class TestRun:
     def test_output_files(self, run_as_module):
@@ -14,12 +13,34 @@ def test_output_files(self, run_as_module):
         csv_files = listdir("receiving")
 
         dates = [
-            "20190722", "20190723", "20190724", "20190725", "20190726",
-            "20190727", "20190728", "20190729", "20190730", "20190731",
-            "20190801", "20190802", "20190803", "20190804",
-            "20200727", "20200728", "20200729", "20200730", "20200731",
-            "20200801", "20200802", "20200803", "20200804", "20200805",
-            "20200806", "20200807", "20200808", "20200809"
+            "20190722",
+            "20190723",
+            "20190724",
+            "20190725",
+            "20190726",
+            "20190727",
+            "20190728",
+            "20190729",
+            "20190730",
+            "20190731",
+            "20190801",
+            "20190802",
+            "20190803",
+            "20190804",
+            "20200727",
+            "20200728",
+            "20200729",
+            "20200730",
+            "20200731",
+            "20200801",
+            "20200802",
+            "20200803",
+            "20200804",
+            "20200805",
+            "20200806",
+            "20200807",
+            "20200808",
+            "20200809",
         ]
 
         expected_files = []
@@ -33,7 +54,14 @@ def test_output_files(self, run_as_module):
         assert set(expected_files).issubset(set(csv_files))
 
         # Test output format
-        df = pd.read_csv(
-            join("./receiving", "20200729_state_bars_visit_num.csv")
-        )
-        assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
+        df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))
+        expected_columns = [
+            "geo_id",
+            "val",
+            "se",
+            "sample_size",
+            "missing_val",
+            "missing_se",
+            "missing_sample_size",
+        ]
+        assert (df.columns.values == expected_columns).all()
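
As illustration of the widened output format the test now asserts, a downstream reader could split rows on the new columns. This is a hypothetical consumer snippet, not part of the commit; it reuses the file name from the test above and again assumes code 0 means "Not Missing", as implied by the mask in add_nancodes.

    import pandas as pd
    from os.path import join

    # Hypothetical consumer check against one exported file.
    df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))

    # Assumption: code 0 == "Not Missing" (implied by add_nancodes).
    reported = df[df["missing_val"] == 0]
    coded_missing = df[df["missing_val"] != 0]
    print(f"{len(reported)} reported rows, {len(coded_missing)} rows coded as missing")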
