Skip to content

Commit 5f3d8f0

Browse files
authored
Merge pull request #274 from cmu-delphi/test-and-delete/jhu-nyc-support
Sanity check for linting
2 parents c012908 + d30ab47 commit 5f3d8f0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to locate content that may be hidden.

47 files changed

+104451
-34520
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"static_file_dir": "./static",
3+
"raw_data_dir": "/common/safegraph",
4+
"export_dir": "./receiving",
5+
"cache_dir": "./cache",
6+
"n_core": "12",
7+
"aws_access_key_id": "{{ safegraph_aws_access_key_id }}",
8+
"aws_secret_access_key": "{{ safegraph_aws_secret_access_key }}",
9+
"aws_default_region": "us-east-1",
10+
"aws_endpoint": "https://s3.wasabisys.com",
11+
"wip_signal": ""
12+
}

ansible/vars.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ pyenv_python_path: "/home/{{ runtime_user }}/.pyenv/versions/{{ python_version }
1111
google_health_api_key: "{{ vault_google_health_api_key }}"
1212
delphi_aws_access_key_id: "{{ vault_delphi_aws_access_key_id }}"
1313
delphi_aws_secret_access_key: "{{ vault_delphi_aws_secret_access_key }}"
14-
14+
safegraph_aws_access_key_id: "{{ vault_safegraph_aws_access_key_id }}"
15+
safegraph_aws_secret_access_key: "{{ vault_safegraph_aws_secret_access_key }}"

ansible/vault.yaml

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
$ANSIBLE_VAULT;1.1;AES256
2-
66386163643862646634343162646465663762643034303563333833633661333932646164656462
3-
6166646131623132393238336263623562373065643633310a663232373237396361623462613333
4-
62373663383565623263306539636431623230633065626363666531366662363065343066363031
5-
3738616663336665340a326138333634306137363837396366303861663064326333613662656630
6-
62306331646637326637363766366237663037306665343761643263646663316535343561623137
7-
63313365653535393639626465343232396261643239303430383138633135346466323834336665
8-
33633064353034613836313265613466623961373565363835343430373138376336363966316365
9-
35663664396436313432376264316663326130306134326231303234393561643436623039613136
10-
63366638396262383762383336643930343661636461646162653734336334306239383132643435
11-
39333665643738643966356431333830646561353263353063326330643731616130396466343339
12-
39346437653063303336626663623835613938633834396430353634383366386237353862643766
13-
37393738353231666565303031393839306463373461393761653866653330646534393832303264
14-
30323038646166366465396235623731343539313633326539663966333437623733626131653437
15-
62326632656462383835656235373664366566343866383938343639613737623631616231616135
16-
633863383761366461363532353137323936
2+
39633436633363346633333638666438316131313337333132396634393538383432623239316463
3+
3435333238376331383439366161656639353039326163370a376363633535623363383233646533
4+
63363865646561323132663032383331346332373364333465643330616638623466333039623831
5+
6530663236313234360a336264656239383166663934303335386238386139386132626165386138
6+
32663164326237323534636263663263666634383339613362633939323565356437663666653436
7+
31353362316334313561333430626361616337643133346664636434313664373333653839323630
8+
65346331383135656135386263643564333063626563336365333865663333353337393866666139
9+
64613735663363323938633161666662653161633835383832656164343836383339376661396332
10+
66353131373265373931366130383632633466363036373562363232663162333966316563373535
11+
65343336363732303132366335616335333334373063313562336330336661353239646533356461
12+
62313365633336613037626261353639323937363066363062356234653631346233373965636461
13+
63326237663537363338346566326232353632663463386135393535343436373335393430393865
14+
33393631623762636230656263363462346561323064653561393666373735313836666238323238
15+
66366564666266343636663666386566336637373036633966643961346636373066356632326464
16+
63336565656666336436383938346461646431353265353133633736363761623634346262616436
17+
61653633326333356330626638386665313865343233393637623662383634346534326537623662
18+
34326633623431343835346339656335386330333664373166313766366339663736376261343965
19+
63616461666230616131326537373130313239663931313330356538356161333537666237376362
20+
64613232333834303737323438616437303666643166383439393030316533343530363863613034
21+
39653761626439356133393164363561316535633230633438316137623333376633663665393634
22+
63333161376263613766353030616336386531303565346263366239653232333764

emr_hosp/delphi_emr_hosp/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Registry for signal names and geo types."""

# Published signal names: the base smoothed signal and its
# weekday-adjusted counterpart.
SMOOTHED = "smoothed_covid19"
SMOOTHED_ADJ = "smoothed_adj_covid19"
SIGNALS = [SMOOTHED, SMOOTHED_ADJ]

# Placeholder written in place of suppressed/missing CSV fields.
NA = "NA"

# Geo types used as base geographies when loading data.
HRR = "hrr"
FIPS = "fips"

emr_hosp/delphi_emr_hosp/run.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def run_module():
6969
logging.info("parallel:\t\t%s", params["parallel"])
7070
logging.info("weekday:\t\t%s", params["weekday"])
7171
logging.info("se:\t\t\t%s", params["se"])
72-
logging.info("prefix:\t\t%s", params["obfuscated_prefix"])
7372

7473
## start generating
7574
for geo in params["geos"]:
@@ -85,8 +84,7 @@ def run_module():
8584
geo,
8685
params["parallel"],
8786
weekday,
88-
params["se"],
89-
params["obfuscated_prefix"]
87+
params["se"]
9088
)
9189
su_inst.update_sensor(
9290
params["input_emr_file"],
Lines changed: 71 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,44 @@
11
"""
22
Generate EMR-hosp sensors.
3-
43
Author: Maria Jahja
54
Created: 2020-06-01
65
"""
7-
86
# standard packages
97
import logging
108
from datetime import timedelta
119
from multiprocessing import Pool, cpu_count
10+
import covidcast
11+
from delphi_utils import read_params
1212

1313
# third party
1414
import numpy as np
1515
import pandas as pd
16-
1716
# first party
1817
from .config import Config, Constants
1918
from .geo_maps import GeoMaps
2019
from .load_data import load_combined_data
2120
from .sensor import EMRHospSensor
2221
from .weekday import Weekday
22+
from .constants import SIGNALS, SMOOTHED, SMOOTHED_ADJ, HRR, NA, FIPS
2323

2424
from delphi_utils import GeoMapper
2525

26-
2726
def write_to_csv(output_dict, write_se, out_name, output_path="."):
2827
"""Write sensor values to csv.
29-
3028
Args:
3129
output_dict: dictionary containing sensor rates, se, unique dates, and unique geo_id
3230
write_se: boolean to write out standard errors, if true, use an obfuscated name
3331
out_name: name of the output file
3432
output_path: outfile path to write the csv (default is current directory)
3533
"""
36-
3734
if write_se:
3835
logging.info(f"========= WARNING: WRITING SEs TO {out_name} =========")
39-
4036
geo_level = output_dict["geo_level"]
4137
dates = output_dict["dates"]
4238
geo_ids = output_dict["geo_ids"]
4339
all_rates = output_dict["rates"]
4440
all_se = output_dict["se"]
4541
all_include = output_dict["include"]
46-
4742
out_n = 0
4843
for i, d in enumerate(dates):
4944
filename = "%s/%s_%s_%s.csv" % (
@@ -52,33 +47,83 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
5247
geo_level,
5348
out_name,
5449
)
55-
5650
with open(filename, "w") as outfile:
5751
outfile.write("geo_id,val,se,direction,sample_size\n")
58-
5952
for geo_id in geo_ids:
6053
sensor = all_rates[geo_id][i]
6154
se = all_se[geo_id][i]
62-
6355
if all_include[geo_id][i]:
6456
assert not np.isnan(sensor), "value for included sensor is nan"
6557
assert not np.isnan(se), "se for included sensor is nan"
6658
if sensor > 90:
6759
logging.warning(f"value suspiciously high, {geo_id}: {sensor}")
6860
assert se < 5, f"se suspiciously high, {geo_id}: {se}"
69-
7061
if write_se:
7162
assert sensor > 0 and se > 0, "p=0, std_err=0 invalid"
7263
outfile.write(
73-
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, "NA", "NA"))
64+
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA))
7465
else:
7566
# for privacy reasons we will not report the standard error
7667
outfile.write(
77-
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, "NA", "NA", "NA")
68+
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA)
7869
)
7970
out_n += 1
8071
logging.debug(f"wrote {out_n} rows for {len(geo_ids)} {geo_level}")
8172

73+
74+
def add_prefix(signal_names, wip_signal, prefix="wip_"):
    """Prepend a work-in-progress prefix to the appropriate signal names.

    Parameters
    ----------
    signal_names: List[str]
        Names of signals to be exported.
    wip_signal : List[str] or bool
        [] or a list of specific signals to mark as WIP, OR
        True to mark every signal in the registry as WIP, OR
        False (or "") to mark only never-published signals as WIP.
    prefix : str
        Prefix attached to new/non-public signals (default "wip_").

    Returns
    -------
    List of signal names
        wip/non wip signals for further computation.
    """
    if wip_signal is True:
        # Everything is treated as work in progress.
        return [prefix + name for name in signal_names]
    if isinstance(wip_signal, list):
        # Only the explicitly listed signals get the prefix.
        flagged = set(wip_signal)
        return [name if name not in flagged else prefix + name
                for name in signal_names]
    if wip_signal in (False, ""):
        # Prefix only signals that COVIDcast has never published.
        result = []
        for name in signal_names:
            result.append(name if public_signal(name) else prefix + name)
        return result
    raise ValueError("Supply True | False or '' or [] | list()")
106+
107+
108+
def public_signal(signal_):
    """Check whether a signal name is already public using COVIDcast metadata.

    Fetches the COVIDcast metadata table and tests membership of the
    `signal` column. NOTE(review): this performs a network call per
    invocation; callers that check many signals may want to cache the
    metadata — confirm against caller (`add_prefix`).

    Parameters
    ----------
    signal_ : str
        Name of the signal.

    Returns
    -------
    bool
        True if the signal is present in the metadata,
        False if the signal is not present.
    """
    epidata_df = covidcast.metadata()
    # Vectorized membership test replaces the original O(n) Python loop
    # with repeated positional indexing (`epidata_df['signal'][index]`),
    # which was slow and assumed a default RangeIndex.
    return bool((epidata_df["signal"] == signal_).any())
125+
126+
82127
class EMRHospSensorUpdator:
83128

84129
def __init__(self,
@@ -88,10 +133,8 @@ def __init__(self,
88133
geo,
89134
parallel,
90135
weekday,
91-
se,
92-
prefix=None):
136+
se):
93137
"""Init Sensor Updator
94-
95138
Args:
96139
startdate: first sensor date (YYYY-mm-dd)
97140
enddate: last sensor date (YYYY-mm-dd)
@@ -100,11 +143,8 @@ def __init__(self,
100143
parallel: boolean to run the sensor update in parallel
101144
weekday: boolean to adjust for weekday effects
102145
se: boolean to write out standard errors, if true, use an obfuscated name
103-
prefix: string to prefix to output files (used for obfuscation in producing SEs)
104-
105146
"""
106147
self.startdate, self.enddate, self.dropdate = [pd.to_datetime(t) for t in (startdate, enddate, dropdate)]
107-
108148
# handle dates
109149
assert (self.startdate > (Config.FIRST_DATA_DATE + Config.BURN_IN_PERIOD)
110150
), f"not enough data to produce estimates starting {self.startdate}"
@@ -114,32 +154,28 @@ def __init__(self,
114154
self.geo, self.parallel, self.weekday, self.se = geo.lower(), parallel, weekday, se
115155

116156
# output file naming
117-
out_name = "smoothed_adj_covid19" if self.weekday else "smoothed_covid19"
118-
if se:
119-
assert prefix is not None, "supply obfuscated prefix in params"
120-
out_name = prefix + "_" + out_name
121-
self.output_filename = out_name
122-
157+
signals = SIGNALS.copy()
158+
signals.remove(SMOOTHED if self.weekday else SMOOTHED_ADJ)
159+
signal_names = add_prefix(
160+
signals,
161+
wip_signal=read_params()["wip_signal"])
162+
self.updated_signal_names = signal_names
123163

124164
def shift_dates(self):
125165
"""shift estimates forward to account for time lag, compute burnindates, sensordates
126166
"""
127-
128167
drange = lambda s, e: pd.date_range(start=s,periods=(e-s).days,freq='D')
129168
self.startdate = self.startdate - Config.DAY_SHIFT
130169
self.burnindate = self.startdate - Config.BURN_IN_PERIOD
131170
self.fit_dates = drange(Config.FIRST_DATA_DATE, self.dropdate)
132171
self.burn_in_dates = drange(self.burnindate, self.dropdate)
133172
self.sensor_dates = drange(self.startdate, self.enddate)
134173
return True
135-
136174
def geo_reindex(self,data):
137175
"""Reindex based on geography, include all date, geo pairs
138-
139176
Args:
140177
data: dataframe, the output of loadcombineddata
141178
staticpath: path for the static geographic files
142-
143179
Returns:
144180
dataframe
145181
"""
@@ -157,92 +193,80 @@ def geo_reindex(self,data):
157193
else:
158194
logging.error(f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr'")
159195
return False
160-
161196
self.unique_geo_ids = pd.unique(data_frame[geo])
162197
data_frame.set_index([geo,'date'],inplace=True)
163-
164198
# for each location, fill in all missing dates with 0 values
165199
multiindex = pd.MultiIndex.from_product((self.unique_geo_ids, self.fit_dates),
166200
names=[geo, "date"])
167201
assert (len(multiindex) <= (Constants.MAX_GEO[geo] * len(self.fit_dates))
168202
), "more loc-date pairs than maximum number of geographies x number of dates"
169-
170203
# fill dataframe with missing dates using 0
171204
data_frame = data_frame.reindex(multiindex, fill_value=0)
172205
data_frame.fillna(0, inplace=True)
173206
return data_frame
174207

175208

209+
176210
def update_sensor(self,
177211
emr_filepath,
178212
claims_filepath,
179213
outpath,
180214
staticpath):
181215
"""Generate sensor values, and write to csv format.
182-
183216
Args:
184217
emr_filepath: path to the aggregated EMR data
185218
claims_filepath: path to the aggregated claims data
186219
outpath: output path for the csv results
187220
staticpath: path for the static geographic files
188221
"""
189-
190222
self.shift_dates()
191223
final_sensor_idxs = (self.burn_in_dates >= self.startdate) & (self.burn_in_dates <= self.enddate)
192224

193225
# load data
194226
## JS: If the data is in fips then can we also put it into hrr?
195227
base_geo = "hrr" if self.geo == "hrr" else "fips"
228+
base_geo = HRR if self.geo == HRR else FIPS
196229
data = load_combined_data(emr_filepath, claims_filepath, self.dropdate, base_geo)
197230

198231
data.reset_index(inplace=True)
199232
data_frame = self.geo_reindex(data)
200-
201233
# handle if we need to adjust by weekday
202234
wd_params = Weekday.get_params(data_frame) if self.weekday else None
203-
204235
# run sensor fitting code (maybe in parallel)
205236
sensor_rates = {}
206237
sensor_se = {}
207238
sensor_include = {}
208239
if not self.parallel:
209240
for geo_id, sub_data in data_frame.groupby(level=0):
210241
sub_data.reset_index(level=0,inplace=True)
211-
212242
if self.weekday:
213243
sub_data = Weekday.calc_adjustment(wd_params, sub_data)
214-
215244
res = EMRHospSensor.fit(sub_data, self.burnindate, geo_id)
216245
res = pd.DataFrame(res)
217246
sensor_rates[geo_id] = np.array(res.loc[final_sensor_idxs,"rate"])
218247
sensor_se[geo_id] = np.array(res.loc[final_sensor_idxs,"se"])
219248
sensor_include[geo_id] = np.array(res.loc[final_sensor_idxs,"incl"])
220-
221249
else:
222250
n_cpu = min(10, cpu_count())
223251
logging.debug(f"starting pool with {n_cpu} workers")
224-
225252
with Pool(n_cpu) as pool:
226253
pool_results = []
227254
for geo_id, sub_data in data_frame.groupby(level=0,as_index=False):
228255
sub_data.reset_index(level=0, inplace=True)
229256
if self.weekday:
230257
sub_data = Weekday.calc_adjustment(wd_params, sub_data)
231-
232258
pool_results.append(
233259
pool.apply_async(
234260
EMRHospSensor.fit, args=(sub_data, self.burnindate, geo_id,),
235261
)
236262
)
237263
pool_results = [proc.get() for proc in pool_results]
238-
239264
for res in pool_results:
240265
geo_id = res["geo_id"]
241266
res = pd.DataFrame(res)
242267
sensor_rates[geo_id] = np.array(res.loc[final_sensor_idxs, "rate"])
243268
sensor_se[geo_id] = np.array(res.loc[final_sensor_idxs, "se"])
244269
sensor_include[geo_id] = np.array(res.loc[final_sensor_idxs, "incl"])
245-
246270
unique_geo_ids = list(sensor_rates.keys())
247271
output_dict = {
248272
"rates": sensor_rates,
@@ -254,6 +278,7 @@ def update_sensor(self,
254278
}
255279

256280
# write out results
257-
write_to_csv(output_dict, self.se, self.output_filename, outpath)
281+
for signal in self.updated_signal_names:
282+
write_to_csv(output_dict, self.se, signal, outpath)
258283
logging.debug(f"wrote files to {outpath}")
259-
return True
284+
return True

0 commit comments

Comments
 (0)