
Commit 1df3553

NANs Safegraph Patterns:
* add missing columns to the export dataframe
* basic support and unknown-reason missingness detection

1 parent f45cb9c commit 1df3553

3 files changed: 64 additions, 22 deletions


safegraph_patterns/delphi_safegraph_patterns/process.py

Lines changed: 17 additions & 1 deletion
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 
-from delphi_utils import create_export_csv, GeoMapper
+from delphi_utils import create_export_csv, GeoMapper, NAN_CODES
 
 INCIDENCE_BASE = 100000
 
@@ -124,6 +124,21 @@ def aggregate(df, metric, geo_res):
         * INCIDENCE_BASE
     return df.rename({geo_key: "geo_id"}, axis=1)
 
+def add_nancodes(df):
+    """Add nan codes to the df."""
+    # Values are not missing unless found to be null
+    df["missing_val"] = NAN_CODES["Not Missing"]
+
+    # These values aren't reported, so are always missing
+    df["missing_se"] = NAN_CODES["Not Applicable"]
+    df["missing_sample_size"] = NAN_CODES["Not Applicable"]
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = (df["missing_val"] == 0) & df["val"].isnull()
+    df.loc[remaining_nans_mask, "missing_val"] = NAN_CODES["Unknown"]
+
+    return df
+
 def process(fname, sensors, metrics, geo_resolutions,
             export_dir, brand_df):
     """
@@ -185,6 +200,7 @@ def process(fname, sensors, metrics, geo_resolutions,
             df_export["val"] = df_export["_".join([metric, sensor])]
             df_export["se"] = np.nan
             df_export["sample_size"] = np.nan
+            df_export = add_nancodes(df_export)
 
             if wip:
                 metric = "wip_" + metric

safegraph_patterns/tests/test_process.py

Lines changed: 6 additions & 8 deletions
@@ -14,7 +14,7 @@
 metric_names, naics_codes, _ = (list(x) for x in zip(*METRICS))
 
 brand_df = pd.read_csv(
-    join("./static", f"brand_info/brand_info_202004.csv")
+    join("./static", "brand_info/brand_info_202004.csv")
 )
 
 class TestProcess:
@@ -23,31 +23,29 @@ def test_construct_signals_present(self):
         df = pd.read_csv('test_data/sample_raw_data.csv',
                          parse_dates=["date_range_start", "date_range_end"])
         dfs = construct_signals(df, metric_names, naics_codes, brand_df)
-        assert set(["timestamp", "zip", 
+        assert set(["timestamp", "zip",
                     "bars_visit_num"]) == set(dfs["bars_visit"].columns)
         assert set(["timestamp", "zip", "restaurants_visit_num"]) == \
             set(dfs["restaurants_visit"].columns)
         assert dfs["bars_visit"]["timestamp"].unique().shape[0] == 7
         assert dfs["restaurants_visit"]["timestamp"].unique().shape[0] == 7
 
     def test_aggregate_county(self):
-        
+
         df = pd.read_csv('test_data/sample_filtered_data.csv', parse_dates=["timestamp"])
         df_export = aggregate(df, "bars_visit", "county")
 
         assert np.all(df_export["bars_visit_num"].values >= 0)
         assert np.all(df_export["bars_visit_prop"].dropna().values <= INCIDENCE_BASE)
-        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop", 
+        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop",
                     "population"]) == set(df_export.columns)
 
     def test_aggregate_state(self):
-        
+
         df = pd.read_csv('test_data/sample_filtered_data.csv', parse_dates=["timestamp"])
         df_export = aggregate(df, "bars_visit", "state")
 
         assert np.all(df_export["bars_visit_num"].values >= 0)
         assert np.all(df_export["bars_visit_prop"].dropna().values <= INCIDENCE_BASE)
-        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop", 
+        assert set(["timestamp", "geo_id", "bars_visit_num", "bars_visit_prop",
                     "population"]) == set(df_export.columns)
-
-

safegraph_patterns/tests/test_run.py

Lines changed: 41 additions & 13 deletions
@@ -3,9 +3,8 @@
 
 import pandas as pd
 
-from delphi_safegraph_patterns.run import (run_module, METRICS,
-                                           SENSORS, GEO_RESOLUTIONS)
-
+from delphi_safegraph_patterns.run import run_module, METRICS, SENSORS, GEO_RESOLUTIONS
+
 
 class TestRun:
     def test_output_files(self, run_as_module):
@@ -14,12 +13,34 @@ def test_output_files(self, run_as_module):
         csv_files = listdir("receiving")
 
         dates = [
-            "20190722", "20190723", "20190724", "20190725", "20190726",
-            "20190727", "20190728", "20190729", "20190730", "20190731",
-            "20190801", "20190802", "20190803", "20190804",
-            "20200727", "20200728", "20200729", "20200730", "20200731",
-            "20200801", "20200802", "20200803", "20200804", "20200805",
-            "20200806", "20200807", "20200808", "20200809"
+            "20190722",
+            "20190723",
+            "20190724",
+            "20190725",
+            "20190726",
+            "20190727",
+            "20190728",
+            "20190729",
+            "20190730",
+            "20190731",
+            "20190801",
+            "20190802",
+            "20190803",
+            "20190804",
+            "20200727",
+            "20200728",
+            "20200729",
+            "20200730",
+            "20200731",
+            "20200801",
+            "20200802",
+            "20200803",
+            "20200804",
+            "20200805",
+            "20200806",
+            "20200807",
+            "20200808",
+            "20200809",
         ]
 
         expected_files = []
@@ -33,7 +54,14 @@ def test_output_files(self, run_as_module):
         assert set(expected_files).issubset(set(csv_files))
 
         # Test output format
-        df = pd.read_csv(
-            join("./receiving", "20200729_state_bars_visit_num.csv")
-        )
-        assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
+        df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))
+        expected_columns = [
+            "geo_id",
+            "val",
+            "se",
+            "sample_size",
+            "missing_val",
+            "missing_se",
+            "missing_sample_size",
+        ]
+        assert (df.columns.values == expected_columns).all()
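
As illustration of the widened output format the test now asserts, a downstream reader could split rows on the new columns. This is a hypothetical consumer snippet, not part of the commit; it reuses the file name from the test above and again assumes code 0 means "Not Missing", as implied by the mask in add_nancodes.

    import pandas as pd
    from os.path import join

    # Hypothetical consumer check against one exported file.
    df = pd.read_csv(join("./receiving", "20200729_state_bars_visit_num.csv"))

    # Assumption: code 0 == "Not Missing" (implied by add_nancodes).
    reported = df[df["missing_val"] == 0]
    coded_missing = df[df["missing_val"] != 0]
    print(f"{len(reported)} reported rows, {len(coded_missing)} rows coded as missing")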
