Skip to content

Commit 9208ce0

Browse files
committed
NANs Safegraph:
* add missingness columns to safegraph
* add data insufficient if the stderr is missing
* add tests
1 parent e105aec commit 9208ce0

File tree

3 files changed

+75
-18
lines changed

3 files changed

+75
-18
lines changed

safegraph/delphi_safegraph/process.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@
66
from typing import List
77
import numpy as np
88
import pandas as pd
9-
from delphi_utils.signal import add_prefix
10-
from delphi_utils.export import create_export_csv
11-
from delphi_utils.geomap import GeoMapper
9+
from delphi_utils import (
10+
add_prefix,
11+
create_export_csv,
12+
GeoMapper,
13+
Nans,
14+
)
1215

1316
from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK, GEO_RESOLUTIONS
1417

@@ -183,7 +186,7 @@ def process_window(df_list: List[pd.DataFrame],
183186
184187
Parameters
185188
----------
186-
cbg_df: pd.DataFrame
189+
df_list: List[pd.DataFrame]
187190
list of census block group-level frames.
188191
signal_names: List[str]
189192
signal names to be processed
@@ -203,15 +206,34 @@ def process_window(df_list: List[pd.DataFrame],
203206
for geo_res in geo_resolutions:
204207
aggregated_df = aggregate(cbg_df, signal_names, geo_res)
205208
for signal in signal_names:
209+
columns_to_export = (
210+
['geo_id'] +
211+
[f'{signal}_{x}' for x in ('mean', 'se', 'n')]
212+
)
206213
df_export = aggregated_df[
207-
['geo_id']
208-
+ [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
214+
columns_to_export
209215
].rename({
210216
f'{signal}_mean': 'val',
211217
f'{signal}_se': 'se',
212218
f'{signal}_n': 'sample_size',
213219
}, axis=1)
214220
df_export["timestamp"] = date.strftime('%Y%m%d')
221+
222+
# Default missingness codes
223+
df_export["missing_val"] = Nans.NOT_MISSING
224+
df_export["missing_se"] = Nans.NOT_MISSING
225+
# Sample size will never be missing in this indicator
226+
# since sample_size just counts the presence of rows for a geo region
227+
df_export["missing_sample_size"] = Nans.NOT_MISSING
228+
# Add missingness codes as detected
229+
# This may occur if all the values are missing for a geographic region
230+
remaining_nans_mask = df_export["val"].isnull()
231+
df_export.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
232+
# This may occur if all the values are missing for a geographic region
233+
# or if the sample size is 1
234+
remaining_nans_mask = df_export["se"].isnull()
235+
df_export.loc[remaining_nans_mask, "missing_se"] = Nans.PRIVACY
236+
215237
create_export_csv(df_export,
216238
export_dir,
217239
geo_res,

safegraph/tests/test_process.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
process_window
1616
)
1717
from delphi_safegraph.run import SIGNALS
18-
18+
from delphi_utils import Nans
1919

2020

2121
class TestProcess:
@@ -150,7 +150,10 @@ def test_process_window(self, tmp_path):
150150
'geo_id': [1053, 1073],
151151
'val': [0.04, 0.14],
152152
'se': [0.02, 0.10],
153-
'sample_size': [2, 2]
153+
'sample_size': [2, 2],
154+
'missing_val': [0, 0],
155+
'missing_se': [0, 0],
156+
'missing_sample_size': [0, 0],
154157
})
155158
actual = pd.read_csv(
156159
export_dir / '20200214_county_completely_home_prop.csv')
@@ -178,49 +181,73 @@ def test_process(self, tmp_path):
178181
'geo_id': ['al', 'ga'],
179182
'val': [6, 3.5],
180183
'se': [None, 0.5],
181-
'sample_size': [1, 2]
184+
'sample_size': [1, 2],
185+
'missing_val': [Nans.NOT_MISSING]*2,
186+
'missing_se': [Nans.PRIVACY, Nans.NOT_MISSING],
187+
'missing_sample_size': [Nans.NOT_MISSING]*2,
182188
}),
183189
'completely_home_prop': pd.DataFrame(data={
184190
'geo_id': ['al', 'ga'],
185191
'val': [0.15, 0.055],
186192
'se': [None, 0.005],
187-
'sample_size': [1, 2]
193+
'sample_size': [1, 2],
194+
'missing_val': [Nans.NOT_MISSING]*2,
195+
'missing_se': [Nans.PRIVACY, Nans.NOT_MISSING],
196+
'missing_sample_size': [Nans.NOT_MISSING]*2,
188197
}),
189198
'part_time_work_prop': pd.DataFrame(data={
190199
'geo_id': ['al', 'ga'],
191200
'val': [0.35, 0.055],
192201
'se': [None, 0.005],
193-
'sample_size': [1, 2]
202+
'sample_size': [1, 2],
203+
'missing_val': [Nans.NOT_MISSING]*2,
204+
'missing_se': [Nans.PRIVACY, Nans.NOT_MISSING],
205+
'missing_sample_size': [Nans.NOT_MISSING]*2,
194206
}),
195207
'full_time_work_prop': pd.DataFrame(data={
196208
'geo_id': ['al', 'ga'],
197209
'val': [0.45, 0.055],
198210
'se': [None, 0.005],
199-
'sample_size': [1, 2]
211+
'sample_size': [1, 2],
212+
'missing_val': [Nans.NOT_MISSING]*2,
213+
'missing_se': [Nans.PRIVACY, Nans.NOT_MISSING],
214+
'missing_sample_size': [Nans.NOT_MISSING]*2,
200215
}),
201216
'median_home_dwell_time_7dav': pd.DataFrame(data={
202217
'geo_id': ['al', 'ga', 'pa'],
203218
'val': [4.5, 3.5, 7.5],
204219
'se': [1.5, 0.5, 0.5],
205-
'sample_size': [2, 2, 2]
220+
'sample_size': [2, 2, 2],
221+
'missing_val': [Nans.NOT_MISSING]*3,
222+
'missing_se': [Nans.NOT_MISSING]*3,
223+
'missing_sample_size': [Nans.NOT_MISSING]*3,
206224
}),
207225
'wip_completely_home_prop_7dav': pd.DataFrame(data={
208226
'geo_id': ['al', 'ga', 'pa'],
209227
'val': [0.1, 0.055, 0.15],
210228
'se': [0.05, 0.005, 0.05],
211-
'sample_size': [2, 2, 2]
229+
'sample_size': [2, 2, 2],
230+
'missing_val': [Nans.NOT_MISSING]*3,
231+
'missing_se': [Nans.NOT_MISSING]*3,
232+
'missing_sample_size': [Nans.NOT_MISSING]*3,
212233
}),
213234
'part_time_work_prop_7dav': pd.DataFrame(data={
214235
'geo_id': ['al', 'ga', 'pa'],
215236
'val': [0.25, 0.055, 0.25],
216237
'se': [0.1, 0.005, 0.05],
217-
'sample_size': [2, 2, 2]
238+
'sample_size': [2, 2, 2],
239+
'missing_val': [Nans.NOT_MISSING]*3,
240+
'missing_se': [Nans.NOT_MISSING]*3,
241+
'missing_sample_size': [Nans.NOT_MISSING]*3,
218242
}),
219243
'full_time_work_prop_7dav': pd.DataFrame(data={
220244
'geo_id': ['al', 'ga', 'pa'],
221245
'val': [0.35, 0.055, 0.35],
222246
'se': [0.1, 0.005, 0.05],
223-
'sample_size': [2, 2, 2]
247+
'sample_size': [2, 2, 2],
248+
'missing_val': [Nans.NOT_MISSING]*3,
249+
'missing_se': [Nans.NOT_MISSING]*3,
250+
'missing_sample_size': [Nans.NOT_MISSING]*3,
224251
})
225252
}
226253
actual = {signal: pd.read_csv(

safegraph/tests/test_run.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,5 +59,13 @@ def test_output_files_format(self, clean_receiving_dir):
5959
# triggered the error.
6060
print(filename)
6161
df = pd.read_csv(os.path.join("receiving", filename))
62-
assert (df.columns.values ==
63-
["geo_id", "val", "se", "sample_size"]).all()
62+
expected_columns = [
63+
"geo_id",
64+
"val",
65+
"se",
66+
"sample_size",
67+
"missing_val",
68+
"missing_se",
69+
"missing_sample_size"
70+
]
71+
assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)