Commit 33063b5

Ananya Joshi authored and committed
some edge cases with fips/only one df for concat mean to use the optimizations, more work is needed. Saved for a future iteration
* added files
* added back in
* double quotes for json files
* quotes for json file
* quotes for json file
* lint error
* lint error
* minor changes to pass lint
* lint and testing changes
* remove extraneous files
1 parent 6f71019 commit 33063b5

File tree

14 files changed: +83, -115 lines changed
_delphi_utils_python/delphi_utils/flash_eval/__main__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,11 +1,11 @@
-# -*- coding: utf-8 -*-
 """Call the function run_module when executed.
 
 This file indicates that calling the module (`python -m MODULE_NAME`) will
 call the function `run_module` found within the run.py file. There should be
 no need to change this template.
 """
 
-from .run import run_module
+from delphi_utils import read_params
+from .run import run_module  # pragma: no cover
 
-run_module()
+run_module(read_params())  # pragma: no cover
```
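
Params are now read once at the entry point and threaded through. A minimal sketch of the equivalent call (assumes delphi_utils is installed and a params.json sits in the working directory):

```python
# Equivalent to `python -m delphi_utils.flash_eval` after this change.
from delphi_utils import read_params
from delphi_utils.flash_eval.run import run_module

run_module(read_params())
```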

_delphi_utils_python/delphi_utils/flash_eval/constants.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -10,6 +10,3 @@
 
 #HTML Link for the visualization tool alerts
 HTML_LINK = "<https://ananya-joshi-visapp-vis-523f3g.streamlitapp.com/?params="
-
-#Bucket for AWS
-BUCKET = 'delphi-covidcast-public'
```

_delphi_utils_python/delphi_utils/flash_eval/eval_day.py

Lines changed: 68 additions & 43 deletions
```diff
@@ -5,7 +5,7 @@
 from scipy.stats import binom
 import boto3
 from delphi_utils.weekday import Weekday
-from .constants import HTML_LINK, STATES, BUCKET
+from .constants import HTML_LINK, STATES
 from .. import (
     get_structured_logger,
 )
```
```diff
@@ -30,7 +30,8 @@ def split_reporting_schedule_dfs(input_df, flash_dir, lag):
     rep_sched = rep_sched.drop('min_cut')
     glob_out_list = []
     non_daily_ar = []
-    for i, df in rep_sched.groupby('0'):
+    rep_sched.columns = ['schedule']
+    for i, df in rep_sched.groupby('schedule'):
         fixed_sum = []
         columns = []
         for col in input_df.columns:
```
```diff
@@ -48,50 +49,53 @@ def split_reporting_schedule_dfs(input_df, flash_dir, lag):
             glob_out_list.append(fixed_sum_df)
         else:
             non_daily_ar.append(fixed_sum_df)
-    return daily_df, pd.concat(non_daily_ar, axis=1), pd.concat(glob_out_list, axis=1)
+    return (daily_df, pd.concat(non_daily_ar, axis=1), pd.concat(glob_out_list, axis=1))
 
 
-def bin_approach(y, yhat, pop, log=False):
+def bin_approach(df, log=False):
     """Create test statistic.
 
     Parameters
     ----------
+    df: DataFrame with columns
     y: observed values for streams
     yhat: predicted values for streams
     pop: population for a region
-    log: difference between reporting and reference date
+
+    log: whether to take logs before computing the test statistic
 
     Returns
     -------
     today's test-statistic values for the stream
     """
     def ts_dist(x, y, n):
         """Initialize test statistic distribution which is then vectorized."""
-        # print(x, y, y/n, n, binom.cdf(x, int(n), y/n)
-        return binom.cdf(x, int(n), y / n)
+        return binom.cdf(x, int(n), y / n)
+
     vec_ts_dist = np.vectorize(ts_dist)
     if log:
-        return vec_ts_dist(np.log(y+2), np.log(yhat+2), np.log(pd.Series(pop)+2))
-    return vec_ts_dist(y, yhat, pop)
+        return pd.DataFrame(vec_ts_dist(np.log(df.y + 2),
+                                        np.log(df.yhat + 2), np.log(df['pop'] + 2)),
+                            index=df.index)
+    return pd.DataFrame(vec_ts_dist(df.y, df.yhat, df['pop']), index=df.index)
+
 
 
-def global_outlier_detect(df, mean, var):
+
+def outlier_detect(df):
     """Determine global outliers by using abs(t-statistic) > 5.
 
     Parameters
     ----------
-    df: Current df to evaluate for global outliers
-    mean: Mean needed for t-statistic calculation
-    var: Variance for t-statistic calculation
+    df: Current df to evaluate for global outliers, with columns
+        for the mean and var.
 
     Returns
     -------
     The columns that are global outliers.
     """
-    all_columns = list(df.columns)
-    mean = mean[all_columns]
-    var = var[all_columns]
-    return df.columns[((abs(df.T.iloc[:, 0].sort_index() - mean.sort_index()) / var.clip(1)).gt(5))]
+    df.columns = ['x', 'mean', 'var']
+    return df.index[((abs(df['x'] - df['mean']) / (df['var'].clip(1))).gt(5))]
 
 def apply_ar(last_7, flash_dir, lag, weekday_correction, non_daily_df, fips_pop_table):
     """Predict y_hat using an AR model.
```
```diff
@@ -109,13 +113,17 @@ def apply_ar(last_7, flash_dir, lag, weekday_correction, non_daily_df, fips_pop_table):
     -------
     ts_streams: return of test statistic for the day's streams.
     """
-    lin_coeff = pd.read_csv(f'{flash_dir}/lin_coeff_{lag}.csv', index_col=0)
     y = pd.concat([weekday_correction, non_daily_df], axis=1)
-    y_hat = pd.Series([np.dot(lin_coeff[x], last_7[x]) for x in y.columns])
-    ts_streams = pd.DataFrame(bin_approach(y, y_hat,
-                                           list(fips_pop_table[y.columns].iloc[0, :]), log=True),
-                              columns=y.columns)
-    return ts_streams
+    y.name = 'y'
+    lin_coeff = pd.read_csv(f'{flash_dir}/lin_coeff_{lag}.csv', index_col=0)
+    y_hat = pd.Series([np.dot(lin_coeff[x], last_7[x]) for x in y.columns], name='yhat')
+    y_hat.index = y.columns
+    df_for_ts = y.T.merge(y_hat, left_index=True, right_index=True).merge(
+        fips_pop_table.T, left_index=True, right_index=True)
+    df_for_ts.columns = ['y', 'yhat', 'pop']
+
+    return df_for_ts, bin_approach(df_for_ts, log=True)
 
 def output(evd_ranking, day, lag, signal, logger):
     """Write the top streams that warrant human inspection to the log.
```
```diff
@@ -159,10 +167,10 @@ def evd_ranking_fn(ts_streams, flash_dir):
     EVD_max = pd.read_csv(f'{flash_dir}/max.csv', index_col=0)
     EVD_min = pd.read_csv(f'{flash_dir}/min.csv', index_col=0)
     evd_ranking = pd.concat(
-        [ts_streams.apply(lambda x: sum(x.values[0] <= EVD_min['0'])
-                          / EVD_min['0'].shape[0], axis=0).sort_values(),
-         ts_streams.apply(lambda x: sum(x.values[0] >= EVD_max['0'])
-                          / EVD_max['0'].shape[0], axis=0).sort_values()],
+        [ts_streams.apply(lambda x: ts_val(x.values[0],
+                                           EVD_min['0']), axis=1).sort_values(),
+         ts_streams.apply(lambda x: 1 - ts_val(x.values[0],
+                                               EVD_max['0']), axis=1).sort_values()],
         axis=1).max(axis=1)
     evd_ranking.name = 'evd_ranking'
     return evd_ranking
```
```diff
@@ -191,7 +199,7 @@ def streams_groups_fn(stream, ts_streams):
     for col, val in ts_streams.T.iterrows():
         if key == col[:2]:
             total_dist = pd.concat([group[0], streams_state]).reset_index(drop=True)
-            ranking_streams[col] = sum(total_dist < val[0]) / total_dist.shape[0]
+            ranking_streams[col] = ts_val(val[0], total_dist)
     stream_group = pd.Series(ranking_streams, name='stream_group')
     return stream_group
 
```
```diff
@@ -215,6 +223,22 @@ def setup_fips(flash_dir):
     fips_pop_table.columns = [STATE_to_fips[x] if x in list(STATES)
                               else x for x in fips_pop_table.columns.droplevel()]
     return STATE_to_fips, fips_pop_table
+
+
+def ts_val(val, dist):
+    """Determine the p-value from the test statistic distribution.
+
+    Parameters
+    ----------
+    val: The test statistic
+    dist: The distribution to compare to
+
+    Returns
+    -------
+    p-value
+    """
+    return sum(val <= dist) / dist.shape[0]
+
 def flash_eval(lag, day, input_df, signal, params, logger=None):
     """Evaluate most recent data using FlaSH.
```
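
ts_val is an empirical p-value: the share of the reference distribution at or above the observed statistic. A two-line check with invented numbers:

```python
import pandas as pd

dist = pd.Series([0.1, 0.2, 0.5, 0.9])   # reference test statistics
print(sum(0.15 <= dist) / dist.shape[0])  # 0.75
```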
```diff
@@ -226,20 +250,20 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
     Output:
     None
     """
-    print("RF")
     if not logger:
         logger = get_structured_logger(
             name=signal,
             filename=params["common"].get("log_filename", None),
             log_exceptions=params["common"].get("log_exceptions", True))
 
-    #TODOv4: Change these to a local dir
+    #TODOv4: Change these to a local dir or aws
     flash_dir = f'flash_ref/{signal}'
     last_7 = pd.read_csv(f'{flash_dir}/last_7_{lag}.csv', index_col=0).astype(float)
     wk_mean = pd.read_csv(f'{flash_dir}/weekday_mean_df_{lag}.csv', index_col=0)
     wk_var = pd.read_csv(f'{flash_dir}/weekday_var_df_{lag}.csv', index_col=0)
     weekday_params = pd.read_csv(f'{flash_dir}/weekday_params_{lag}.csv', index_col=0)
     summary_stats = pd.read_csv(f'{flash_dir}/summary_stats_{lag}.csv', index_col=0)
+    summary_stats.index = ['0.25', 'median', '0.75', 'mean', 'var']
     stream = pd.read_csv(f'{flash_dir}/ret_df2_{lag}.csv', index_col=0)
 
 
```
```diff
@@ -258,10 +282,9 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
     daily_update_df, non_daily_df_test, non_ar_df = split_reporting_schedule_dfs(input_df,
                                                                                  flash_dir, lag)
     # Weekday outlier [only for Daily Df]
-    weekday_outlier = daily_update_df.columns[((abs(
-        daily_update_df.T.sort_index()[day] - wk_mean.loc[day.day_of_week,
-        daily_update_df.columns].sort_index()) / wk_var.loc[day.day_of_week,
-        daily_update_df.columns].clip(1)).gt(5))]
+    weekday_outlier = outlier_detect(daily_update_df.T.merge(wk_mean.loc[day.day_of_week, :],
+        left_index=True, right_index=True).merge(wk_var.loc[day.day_of_week, :],
+        left_index=True, right_index=True))
 
     # Make weekday correction for daily update
     additive_factor = summary_stats[daily_update_df.columns].iloc[4, :]
```
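
The refactor trades inline index-aligned arithmetic for one merged frame handed to outlier_detect; a toy reproduction of that pattern (data invented):

```python
import pandas as pd

obs = pd.DataFrame({'x': [10.0, 500.0]}, index=['ca', 'ny'])
mean = pd.Series({'ca': 12.0, 'ny': 20.0}, name='mean')
var = pd.Series({'ca': 4.0, 'ny': 0.5}, name='var')

merged = obs.merge(mean, left_index=True, right_index=True) \
            .merge(var, left_index=True, right_index=True)

# outlier_detect renames the columns to ['x', 'mean', 'var'] and flags
# abs(x - mean) / max(var, 1) > 5.
print(list(merged.index[(abs(merged['x'] - merged['mean'])
                         / merged['var'].clip(1)).gt(5)]))  # ['ny']
```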
```diff
@@ -273,17 +296,18 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
 
     global_outlier_list = []
     for df in [weekday_correction, non_daily_df_test, non_ar_df]:
-        global_outlier_list += list(
-            global_outlier_detect(df, summary_stats[df.columns].iloc[2],
-                                  summary_stats[df.columns].iloc[4]))
+        global_outlier_list += list(outlier_detect(
+            df.T.merge(summary_stats[df.columns].loc['median', :],
+                       left_index=True, right_index=True)
+                .merge(summary_stats[df.columns].loc['var', :],
+                       left_index=True, right_index=True)))
 
     # Apply AR
-    ts_streams = apply_ar(last_7, flash_dir, lag, weekday_correction,
+    df_for_ts, ts_streams = apply_ar(last_7, flash_dir, lag, weekday_correction,
                           non_daily_df_test, fips_pop_table)
     # find stream ranking (individual)
-    stream_individual = ts_streams.apply(
-        lambda x: sum(x.values[0] <= stream[x.name].dropna()) /
-        stream[x.name].dropna().shape[0], axis=0)
+    stream_individual = ts_streams.T.apply(lambda x: ts_val(x.values[0],
+                                                            stream[x.name].dropna()))
+
     stream_individual.name = 'stream_individual'
 
 
```
```diff
@@ -308,15 +332,16 @@ def flash_eval(lag, day, input_df, signal, params, logger=None):
                           how='outer').merge(stream_group,
                           left_index=True, right_index=True, how='outer').merge(evd_ranking,
                           left_index=True, right_index=True, how='outer'
-                          )
+                          ).merge(df_for_ts, left_index=True,
+                                  right_index=True, how='outer')
     #if aws parameters are passed, save this dataframe to AWS
     if params.get('archive', None):
         if params['archive'].get("aws_credentials", None):
             session = boto3.Session(
                 aws_access_key_id=params['archive']['aws_credentials']["aws_access_key_id"],
                 aws_secret_access_key=params['archive']['aws_credentials']["aws_secret_access_key"])
             s3 = session.resource('s3')
-            s3.Object(BUCKET,
+            s3.Object(params['flash']["aws_bucket"],
                       f'flags-dev/flash_results/{signal}_{day.strftime("%m_%d_%Y")}_{lag}.csv').put(
                 Body=type_of_outlier.to_csv(), ACL='public-read')
```
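
The bucket now comes from params instead of the removed BUCKET constant. A sketch of the params shape the S3 path expects (bucket name and credentials here are placeholders, not real values):

```python
params = {
    "flash": {
        "aws_bucket": "example-bucket",  # rendered from {{ flash_aws_bucket_name }}
        "signals": ["confirmed_incidence_num"],
        "lags": ["1"],
        "support": ["0", "400000000"],
    },
    "archive": {
        "aws_credentials": {
            "aws_access_key_id": "PLACEHOLDER_KEY_ID",      # placeholder
            "aws_secret_access_key": "PLACEHOLDER_SECRET",  # placeholder
        }
    },
}
```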

_delphi_utils_python/delphi_utils/flash_eval/params.json.template

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,5 +1,6 @@
 {
     "flash": {
+        "aws_bucket": "{{ flash_aws_bucket_name }}",
         "signals": ["confirmed_incidence_num"],
         "lags": ["1"],
         "support": ["0", "400000000"]
```

_delphi_utils_python/delphi_utils/flash_eval/run.py

Lines changed: 7 additions & 9 deletions
```diff
@@ -5,30 +5,28 @@
 when the module is run with `python -m delphi_utils.flash_eval`.
 """
 from datetime import date
-from collections import defaultdict
 import pandas as pd
-from .. import read_params
 from .eval_day import flash_eval
 from ..validator.datafetcher import read_filenames, load_csv
 
-def run_module():
+def run_module(params):
     """Run the FlaSH module.
 
     The parameters dictionary must include the signals to evaluate.
     We are only considering lag-1 data.
     """
-    params = read_params()
     signals = params["flash"]["signals"]
    for signal in signals:
         export_files = read_filenames(params["common"]["export_dir"])
-        days = defaultdict(list)
-        #Concat the data from recent files at nation, state, and county resolution per day.
+        days = {}
         for (x, _) in export_files:
+            #Concat the data from recent files at nation, state, and county resolution per day.
             if signal in x and pd.Series([y in x for y in ['state', 'county', 'nation']]).any():
                 day = pd.to_datetime(x.split('_')[0], format="%Y%m%d", errors='raise')
-                days[day].append(load_csv(f"{params['common']['export_dir']}/{x}"))
-        days[day] = pd.concat(days[day])
-        for day, input_df in days.items():
+                days[day] = pd.concat([days.get(day, pd.DataFrame()),
+                                       load_csv(f"{params['common']['export_dir']}/{x}")])
+
+        for day, input_df in dict(sorted(days.items())).items():
             input_df = input_df[['geo_id', 'val']].set_index('geo_id').T
             input_df.index = [day]
             today = date.today()
```
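
The defaultdict(list)-plus-separate-concat pattern is replaced by growing each day's frame as files arrive, then iterating days in sorted order. A minimal sketch of the new accumulation (toy frames):

```python
import pandas as pd

days = {}
for chunk in [pd.DataFrame({'geo_id': ['ca'], 'val': [1.0]}),
              pd.DataFrame({'geo_id': ['ny'], 'val': [2.0]})]:
    day = pd.Timestamp('2022-10-01')
    # Append this file's rows to whatever the day already holds.
    days[day] = pd.concat([days.get(day, pd.DataFrame()), chunk])

for day, input_df in dict(sorted(days.items())).items():
    print(day.date(), len(input_df))  # 2022-10-01 2
```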

_delphi_utils_python/delphi_utils/runner.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -67,7 +67,7 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
         archiver = archiver_fn(params)
 
     if flash_fn:
-        t = threading.Timer(timer, flash_fn)
+        t = threading.Timer(timer, flash_fn, args=(params,))
         t.start()
         t.join(timer)
         if t.is_alive():
```
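
threading.Timer takes the callable and its arguments separately; writing flash_fn(params) inline would invoke it immediately and hand its return value to the timer, so the diff above passes params via args. A minimal sketch (flash_fn here is a stand-in):

```python
import threading

def flash_fn(params):
    print("flash run with", sorted(params))

timer = 2  # seconds
t = threading.Timer(timer, flash_fn, args=({"flash": {}},))
t.start()
t.join(timer)
if t.is_alive():
    t.cancel()
```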

_delphi_utils_python/tests/flash_ref/last_7_1.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_2.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_3.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_4.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_5.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_6.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

_delphi_utils_python/tests/flash_ref/last_7_7.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

jhu/params.json.template

Lines changed: 3 additions & 0 deletions
```diff
@@ -50,8 +50,11 @@
         "indicator_prefix": "jhu"
     },
     "flash": {
+        "aws_bucket": "{{ flash_aws_bucket_name }}",
         "signals": ["confirmed_incidence_num"],
         "lags": ["1"],
         "support": ["0", "400000000"]
+
     }
+
 }
```
