Commit 447d07a

Ananya Joshi and dshemetov authored and Ananya Joshi committed
Squash Commits from PR testing
Update _delphi_utils_python/delphi_utils/flash_eval/eval_day.py Co-authored-by: Dmitry Shemetov <[email protected]>
1 parent fa2f441 commit 447d07a

File tree

14 files changed: +85 -115 lines changed
_delphi_utils_python/delphi_utils/flash_eval/__main__.py

Lines changed: 3 additions & 3 deletions

@@ -1,11 +1,11 @@
-# -*- coding: utf-8 -*-
 """Call the function run_module when executed.

 This file indicates that calling the module (`python -m MODULE_NAME`) will
 call the function `run_module` found within the run.py file. There should be
 no need to change this template.
 """

-from .run import run_module
+from delphi_utils import read_params
+from .run import run_module  # pragma: no cover

-run_module()
+run_module(read_params())  # pragma: no cover
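
With this change the entry point no longer loads configuration itself; whoever calls run_module supplies the params dict. A minimal sketch of what that enables in tests (the keys mirror this commit's params.json.template; the values below are illustrative placeholders, not real settings):

    # Hypothetical test harness: exercise run_module with a hand-built
    # params dict instead of a params.json file on disk.
    from delphi_utils.flash_eval.run import run_module

    test_params = {
        "common": {"export_dir": "./receiving"},  # assumed export location
        "flash": {
            "signals": ["confirmed_incidence_num"],
            "lags": ["1"],
            "support": ["0", "400000000"],
        },
    }
    run_module(test_params)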

_delphi_utils_python/delphi_utils/flash_eval/constants.py

Lines changed: 0 additions & 3 deletions

@@ -10,6 +10,3 @@

 #HTML Link for the visualization tool alerts
 HTML_LINK = "<https://ananya-joshi-visapp-vis-523f3g.streamlitapp.com/?params="
-
-#Bucket for AWS
-BUCKET = 'delphi-covidcast-public'

_delphi_utils_python/delphi_utils/flash_eval/eval_day.py

Lines changed: 70 additions & 43 deletions

@@ -5,7 +5,7 @@
 from scipy.stats import binom
 import boto3
 from delphi_utils.weekday import Weekday
-from .constants import HTML_LINK, STATES, BUCKET
+from .constants import HTML_LINK, STATES
 from .. import (
     get_structured_logger,
 )
@@ -30,7 +30,8 @@ def split_reporting_schedule_dfs(input_df, flash_dir, lag):
     rep_sched = rep_sched.drop('min_cut')
     glob_out_list = []
     non_daily_ar = []
-    for i, df in rep_sched.groupby('0'):
+    rep_sched.columns = ['schedule']
+    for i, df in rep_sched.groupby('schedule'):
         fixed_sum = []
         columns = []
         for col in input_df.columns:
@@ -48,50 +49,53 @@ def split_reporting_schedule_dfs(input_df, flash_dir, lag):
             glob_out_list.append(fixed_sum_df)
         else:
             non_daily_ar.append(fixed_sum_df)
-    return daily_df, pd.concat(non_daily_ar,axis=1) , pd.concat(glob_out_list, axis=1)
+    return (daily_df, pd.concat(non_daily_ar,axis=1) , pd.concat(glob_out_list, axis=1))


-def bin_approach(y, yhat, pop, log=False):
+def bin_approach(df, log=False):
     """Create test statistic.

     Parameters
     ----------
+    df with columns of
     y: observed values for streams
     yhat: predicted values for streams
     pop: population for a region
-    log: difference between reporting and reference date
+
+    log: taking the log for the test statistic measure

     Returns
     -------
     today's test-statistic values for the stream
     """
     def ts_dist(x, y, n):
         """Initialize test statistic distribution which is then vectorized."""
-        # print(x, y, y/n, n, binom.cdf(x, int(n), y/n)
-        return binom.cdf(x, int(n), y/n)
+        return binom.cdf(x, int(n), y / n)
+
     vec_ts_dist = np.vectorize(ts_dist)
     if log:
-        return vec_ts_dist(np.log(y+2), np.log(yhat+2), np.log(pd.Series(pop)+2))
-    return vec_ts_dist(y, yhat, pop)
+        return pd.DataFrame(vec_ts_dist(np.log(df.y + 2),
+                            np.log(df.yhat + 2), np.log(df['pop'] + 2)),
+                            index=df.index)
+    return pd.DataFrame(vec_ts_dist(df.y, df.yhat, df.pop), index=df.index)


-def global_outlier_detect(df, mean, var):
+
+def outlier_detect(df):
     """Determine global outliers by using abs(t-statistic) > 5.

     Parameters
     ----------
-    df: Current df to evaluate for global outliers
-    mean: Mean needed for t-statistic calculation
-    var: Variance for t-statistic calculation
+    df: Current df to evaluate for global outliers with columns
+        for mean and var.

     Returns
     -------
     The columns that are global outliers.
     """
-    all_columns = list(df.columns)
-    mean = mean[all_columns]
-    var = var[all_columns]
-    return df.columns[((abs(df.T.iloc[:, 0].sort_index() - mean.sort_index())/var.clip(1)).gt(5))]
+    df.columns = ['x', 'mean', 'var']
+    return df.index[((abs(df['x'] - df['mean']) / (df['var'].clip(1))).gt(5))]

 def apply_ar(last_7, flash_dir, lag, weekday_correction, non_daily_df, fips_pop_table):
     """Predict y_hat using an AR model.
@@ -109,13 +113,17 @@ def apply_ar(last_7, flash_dir, lag, weekday_correction, non_daily_df, fips_pop_table):
     -------
     ts_streams: return of test statistic for the day's streams.
     """
-    lin_coeff = pd.read_csv(f'{flash_dir}/lin_coeff_{lag}.csv', index_col=0)
     y = pd.concat([weekday_correction, non_daily_df], axis=1)
-    y_hat = pd.Series([np.dot(lin_coeff[x], last_7[x]) for x in y.columns])
-    ts_streams = pd.DataFrame(bin_approach(y, y_hat,
-                              list(fips_pop_table[y.columns].iloc[0, :]), log=True),
-                              columns=y.columns)
-    return ts_streams
+    y.name = 'y'
+    lin_coeff = pd.read_csv(f'{flash_dir}/lin_coeff_{lag}.csv', index_col=0)
+    y_hat = pd.Series([np.dot(lin_coeff[x], last_7[x]) for x in y.columns], name='yhat')
+    y_hat.index = y.columns
+    df_for_ts = y.T.merge(y_hat, left_index=True, right_index=True).merge(fips_pop_table.T
+                          , left_index=True, right_index=True)
+    df_for_ts.columns = ['y', 'yhat', 'pop']
+
+    return df_for_ts, bin_approach(df_for_ts, log=True)
+

 def output(evd_ranking, day, lag, signal, logger):
     """Write the top streams that warrant human inspection to the log.
@@ -138,6 +146,8 @@ def output(evd_ranking, day, lag, signal, logger):
         if j < 30:
             start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
             p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"
+        else:
+            break
     name = f"Signal: {signal} Lag: {lag}"
     logger.info(name, payload=p_text)

@@ -157,10 +167,10 @@ def evd_ranking_fn(ts_streams, flash_dir):
     EVD_max = pd.read_csv(f'{flash_dir}/max.csv', index_col=0)
     EVD_min = pd.read_csv(f'{flash_dir}/min.csv', index_col=0)
     evd_ranking = pd.concat(
-        [ts_streams.apply(lambda x: sum(x.values[0] <= EVD_min['0'])
-                          / EVD_min['0'].shape[0], axis=0).sort_values(),
-         ts_streams.apply(lambda x: sum(x.values[0] >= EVD_max['0'])
-                          / EVD_max['0'].shape[0], axis=0).sort_values()],
+        [ts_streams.apply(lambda x: ts_val(x.values[0],
+                          EVD_min['0']), axis=1).sort_values(),
+         ts_streams.apply(lambda x: 1 - ts_val(x.values[0],
+                          EVD_max['0']), axis=1).sort_values()],
         axis=1).max(axis=1)
     evd_ranking.name = 'evd_ranking'
     return evd_ranking
@@ -189,7 +199,7 @@ def streams_groups_fn(stream, ts_streams):
     for col, val in ts_streams.T.iterrows():
         if key == col[:2]:
             total_dist = pd.concat([group[0], streams_state]).reset_index(drop=True)
-            ranking_streams[col] = sum(total_dist < val[0]) / total_dist.shape[0]
+            ranking_streams[col] = ts_val(val[0], total_dist)
     stream_group = pd.Series(ranking_streams, name='stream_group')
     return stream_group

@@ -213,6 +223,22 @@ def setup_fips(flash_dir):
     fips_pop_table.columns = [STATE_to_fips[x] if x in list(STATES)
                               else x for x in fips_pop_table.columns.droplevel()]
     return STATE_to_fips, fips_pop_table
+
+
+def ts_val(val, dist):
+    """Determine p-value from the test statistic distribution.
+
+    Parameters
+    ----------
+    val: The test statistic
+    dist: The distribution to compare to
+
+    Returns
+    -------
+    p-value
+    """
+    return sum(val <= dist) / dist.shape[0]
+
 def flash_eval(lag, day, input_df, signal, params, logger=None):
     """Evaluate most recent data using FlaSH.

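The new ts_val helper centralizes the empirical-distribution lookup that evd_ranking_fn, streams_groups_fn, and flash_eval previously each inlined. A quick sketch of its behavior on a toy reference distribution (values are illustrative only):

    import pandas as pd

    def ts_val(val, dist):
        # Fraction of the reference distribution at or above the observed
        # statistic: an empirical tail probability in [0, 1].
        return sum(val <= dist) / dist.shape[0]

    ref = pd.Series([0.1, 0.2, 0.4, 0.6, 0.9])  # toy historical statistics
    ts_val(0.05, ref)  # -> 1.0, below everything seen before
    ts_val(0.95, ref)  # -> 0.0, above everything seen before

Note that evd_ranking_fn uses 1 - ts_val(...) against the EVD max distribution, so both tails are scored on the same scale.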
@@ -224,20 +250,20 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
     Output:
     None
     """
-
     if not logger:
         logger = get_structured_logger(
             name=signal,
             filename=params["common"].get("log_filename", None),
             log_exceptions=params["common"].get("log_exceptions", True))

-    #TODOv4: Change these to a local dir
+    #TODOv4: Change these to a local dir or aws
     flash_dir = f'flash_ref/{signal}'
     last_7 = pd.read_csv(f'{flash_dir}/last_7_{lag}.csv', index_col=0).astype(float)
     wk_mean = pd.read_csv(f'{flash_dir}/weekday_mean_df_{lag}.csv', index_col=0)
     wk_var = pd.read_csv(f'{flash_dir}/weekday_var_df_{lag}.csv', index_col=0)
     weekday_params = pd.read_csv(f'{flash_dir}/weekday_params_{lag}.csv', index_col=0)
     summary_stats = pd.read_csv(f'{flash_dir}/summary_stats_{lag}.csv', index_col=0)
+    summary_stats.index = ['0.25', 'median', '0.75', 'mean', 'var']
     stream = pd.read_csv(f'{flash_dir}/ret_df2_{lag}.csv', index_col=0)


@@ -256,10 +282,9 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
     daily_update_df, non_daily_df_test, non_ar_df = split_reporting_schedule_dfs(input_df,
                                                                                  flash_dir, lag)
     # Weekday outlier [only for Daily Df]
-    weekday_outlier = daily_update_df.columns[((abs(
-        daily_update_df.T.sort_index()[day] - wk_mean.loc[day.day_of_week,
-        daily_update_df.columns].sort_index()) /wk_var.loc[day.day_of_week,
-        daily_update_df.columns].clip(1)).gt(5))]
+    weekday_outlier = outlier_detect(daily_update_df.T.merge(wk_mean.loc[day.day_of_week, :],
+                      left_index=True, right_index=True).merge(wk_var.loc[day.day_of_week, :],
+                      left_index=True, right_index=True))

     # Make weekday correction for daily update
     additive_factor = summary_stats[daily_update_df.columns].iloc[4, :]
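
Both call sites now build the same three-column shape before calling outlier_detect: today's value merged with the reference mean and variance, one row per stream. A toy sketch of the rule it applies (illustrative numbers; the code divides by the variance clipped to at least 1):

    import pandas as pd

    merged = pd.DataFrame({'x':    [10.0, 400.0],   # today's values
                           'mean': [ 9.0,  12.0],   # reference mean
                           'var':  [ 2.0,   3.0]},  # reference variance
                          index=['ca', 'ny'])

    # Same test as outlier_detect: |x - mean| / max(var, 1) > 5.
    outliers = merged.index[(abs(merged['x'] - merged['mean'])
                             / merged['var'].clip(1)).gt(5)]
    # -> only 'ny': |400 - 12| / 3 is about 129, well past the cutoff.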
@@ -271,17 +296,18 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):

     global_outlier_list = []
     for df in [weekday_correction, non_daily_df_test, non_ar_df]:
-        global_outlier_list += list(
-            global_outlier_detect(df, summary_stats[df.columns].iloc[2],
-                                  summary_stats[df.columns].iloc[4]))
+        global_outlier_list += list(outlier_detect(df.T.merge(summary_stats[df.columns].loc['median'
+                                    ,:], left_index=True, right_index=True
+                                    ).merge(summary_stats[df.columns].loc['var',:],
+                                    left_index=True, right_index=True)))

     # Apply AR
-    ts_streams = apply_ar(last_7, flash_dir, lag, weekday_correction,
+    ts_streams, df_for_ts = apply_ar(last_7, flash_dir, lag, weekday_correction,
                           non_daily_df_test, fips_pop_table)
     # find stream ranking (individual)
-    stream_individual = ts_streams.apply(
-        lambda x: sum(x.values[0] <= stream[x.name].dropna()) /
-        stream[x.name].dropna().shape[0], axis=0)
+    stream_individual = ts_streams.T.apply(lambda x: ts_val(x.values[0],
+                                           stream[x.name].dropna()))
+
     stream_individual.name = 'stream_individual'

@@ -306,15 +332,16 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
                    how='outer').merge(stream_group,
                    left_index=True, right_index=True, how='outer').merge(evd_ranking,
                    left_index=True, right_index=True, how='outer'
-                   )
+                   ).merge(df_for_ts, left_index=True,
+                   right_index=True, how='outer')
     #if aws parameters are passed, save this dataframe to AWS
     if params.get('archive', None):
         if params['archive'].get("aws_credentials", None):
             session = boto3.Session(
                 aws_access_key_id=params['archive']['aws_credentials']["aws_access_key_id"],
                 aws_secret_access_key=params['archive']['aws_credentials']["aws_secret_access_key"])
             s3 = session.resource('s3')
-            s3.Object(BUCKET,
+            s3.Object(params['flash']["aws_bucket"],
                       f'flags-dev/flash_results/{signal}_{day.strftime("%m_%d_%Y")}_{lag}.csv').put(
                 Body=type_of_outlier.to_csv(), ACL='public-read')

_delphi_utils_python/delphi_utils/flash_eval/params.json.template

Lines changed: 1 addition & 0 deletions

@@ -1,5 +1,6 @@
 {
     "flash": {
+        "aws_bucket": "{{ flash_aws_bucket_name }}",
         "signals": ["confirmed_incidence_num"],
         "lags": ["1"],
         "support": ["0", "400000000"]

_delphi_utils_python/delphi_utils/flash_eval/run.py

Lines changed: 7 additions & 9 deletions

@@ -6,34 +6,32 @@
 """
 from datetime import date
 import pandas as pd
-from .. import read_params
 from .eval_day import flash_eval
 from ..validator.datafetcher import read_filenames, load_csv

-def run_module():
+def run_module(params):
     """Run the FlaSH module.

     The parameters dictionary must include the signals and lags.
     We are only considering lag-1 data.
     """
-    params = read_params()
     signals = params["flash"]["signals"]
     for signal in signals:
         export_files = read_filenames(params["common"]["export_dir"])
         days = {}
-        #Concat the data from recent files at nation, state, and county resolution per day.
         for (x, _) in export_files:
+            #Concat the data from recent files at nation, state, and county resolution per day.
             if signal in x and pd.Series([y in x for y in ['state', 'county', 'nation']]).any():
                 day = pd.to_datetime(x.split('_')[0], format="%Y%m%d", errors='raise')
                 days[day] = pd.concat([days.get(day, pd.DataFrame()),
                                        load_csv(f"{params['common']['export_dir']}/{x}")])
-        for day, input_df in days.items():
+
+        for day, input_df in dict(sorted(days.items())).items():
            input_df = input_df[['geo_id', 'val']].set_index('geo_id').T
            input_df.index = [day]
            today = date.today()
            lag = (pd.to_datetime(today) - pd.to_datetime(day)).days
-            #test case for initial flash implementation: assume lag == 1
+            # initial flash implementation: assume lag == 1 always
            #if str(lag) in params["flash"]["lags"]:
-            if True:
-                lag=1
-                flash_eval(int(lag), day, input_df, signal, params)
+            lag = 1
+            flash_eval(int(lag), day, input_df, signal, params)
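
A small sketch of what the dict(sorted(...)) change buys: days are processed in chronological order regardless of the order the export files were discovered in (toy dates, illustrative only):

    import pandas as pd

    days = {pd.Timestamp('2022-10-03'): 'df_c',
            pd.Timestamp('2022-10-01'): 'df_a',
            pd.Timestamp('2022-10-02'): 'df_b'}

    # Iteration now follows the dates, not insertion order.
    for day, input_df in dict(sorted(days.items())).items():
        print(day.date(), input_df)  # 2022-10-01 df_a, then df_b, then df_c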

_delphi_utils_python/delphi_utils/runner.py

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
         archiver = archiver_fn(params)

     if flash_fn:
-        t = threading.Timer(timer, flash_fn)
+        t = threading.Timer(timer, flash_fn(params))
         t.start()
         t.join(timer)
         if t.is_alive():
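
One caveat on this line: threading.Timer(interval, function) stores a callable and invokes it when the timer fires, so flash_fn(params) here is evaluated immediately and its return value is what gets scheduled. If the intent is to defer the call itself, the stdlib signature takes the arguments separately; a sketch under that assumption:

    import threading

    # flash_fn is not invoked until the timer fires; params is passed then.
    t = threading.Timer(timer, flash_fn, args=(params,))
    t.start()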

_delphi_utils_python/tests/flash_ref/last_7_1.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_2.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_3.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_4.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_5.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_6.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_7.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

jhu/params.json.template

Lines changed: 3 additions & 0 deletions

@@ -50,8 +50,11 @@
         "indicator_prefix": "jhu"
     },
     "flash": {
+        "aws_bucket": "{{ flash_aws_bucket_name }}",
         "signals": ["confirmed_incidence_num"],
         "lags": ["1"],
         "support": ["0", "400000000"]
+
     }
+
 }
