Skip to content

Commit 58e734c

Browse files
committed
NANs Safegraph Patterns:
* add the missing_* columns to the export dataframe * add basic detection of not-applicable and unknown-reason missingness
1 parent 2219278 commit 58e734c

File tree

2 files changed

+56
-11
lines changed

2 files changed

+56
-11
lines changed

safegraph_patterns/delphi_safegraph_patterns/process.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import pandas as pd
88

9-
from delphi_utils import create_export_csv, GeoMapper
9+
from delphi_utils import create_export_csv, GeoMapper, Nans
1010

1111
INCIDENCE_BASE = 100000
1212

@@ -124,6 +124,21 @@ def aggregate(df, metric, geo_res):
124124
* INCIDENCE_BASE
125125
return df.rename({geo_key: "geo_id"}, axis=1)
126126

127+
def add_nancodes(df):
    """Add missingness-code columns to the export dataframe.

    Adds the three ``missing_*`` columns expected by the export format:

    * ``missing_val``         -- why ``val`` is missing (``NOT_MISSING`` when present)
    * ``missing_se``          -- always ``NOT_APPLICABLE`` (``se`` is never reported here)
    * ``missing_sample_size`` -- always ``NOT_APPLICABLE`` (never reported here)

    Parameters
    ----------
    df : pd.DataFrame
        Export dataframe containing at least a ``val`` column. Mutated in
        place and also returned for convenient chaining.

    Returns
    -------
    pd.DataFrame
        The same dataframe with the three ``missing_*`` columns added.
    """
    # Values are assumed present unless found to be null below.
    df["missing_val"] = Nans.NOT_MISSING

    # se and sample_size are never reported by this indicator,
    # so their missingness reason is always "not applicable".
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # Any null value with no more specific cause gets the catch-all UNKNOWN
    # code. Compare against the enum member rather than the magic literal 0
    # so the check stays correct if the Nans codes are ever renumbered.
    remaining_nans_mask = (df["missing_val"] == Nans.NOT_MISSING) & df["val"].isnull()
    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN

    return df
141+
127142
def process(fname, sensors, metrics, geo_resolutions,
128143
export_dir, brand_df):
129144
"""
@@ -185,6 +200,7 @@ def process(fname, sensors, metrics, geo_resolutions,
185200
df_export["val"] = df_export["_".join([metric, sensor])]
186201
df_export["se"] = np.nan
187202
df_export["sample_size"] = np.nan
203+
df_export = add_nancodes(df_export)
188204

189205
if wip:
190206
metric = "wip_" + metric

safegraph_patterns/tests/test_run.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,34 @@ def test_output_files(self, clean_receiving_dir):
2929
csv_files = listdir("receiving")
3030

3131
dates = [
32-
"20190722", "20190723", "20190724", "20190725", "20190726",
33-
"20190727", "20190728", "20190729", "20190730", "20190731",
34-
"20190801", "20190802", "20190803", "20190804",
35-
"20200727", "20200728", "20200729", "20200730", "20200731",
36-
"20200801", "20200802", "20200803", "20200804", "20200805",
37-
"20200806", "20200807", "20200808", "20200809"
32+
"20190722",
33+
"20190723",
34+
"20190724",
35+
"20190725",
36+
"20190726",
37+
"20190727",
38+
"20190728",
39+
"20190729",
40+
"20190730",
41+
"20190731",
42+
"20190801",
43+
"20190802",
44+
"20190803",
45+
"20190804",
46+
"20200727",
47+
"20200728",
48+
"20200729",
49+
"20200730",
50+
"20200731",
51+
"20200801",
52+
"20200802",
53+
"20200803",
54+
"20200804",
55+
"20200805",
56+
"20200806",
57+
"20200807",
58+
"20200808",
59+
"20200809",
3860
]
3961

4062
expected_files = []
@@ -48,7 +70,14 @@ def test_output_files(self, clean_receiving_dir):
4870
assert set(expected_files).issubset(set(csv_files))
4971

5072
# Test output format
51-
df = pd.read_csv(
52-
join("./receiving", "20200729_state_bars_visit_num.csv")
53-
)
54-
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
73+
df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))
74+
expected_columns = [
75+
"geo_id",
76+
"val",
77+
"se",
78+
"sample_size",
79+
"missing_val",
80+
"missing_se",
81+
"missing_sample_size",
82+
]
83+
assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)