Skip to content

Commit 5e60926

Browse files
committed
Average diff check correction
1 parent 249038c commit 5e60926

File tree

1 file changed

+26
-7
lines changed

1 file changed

+26
-7
lines changed

validator/delphi_validator/fbsurveyvalidation.py

Lines changed: 26 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -114,21 +114,36 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date):
114114
def reldiff_by_min(x, y):
    """Return the difference ``x - y`` relative to the smaller of the two values.

    Args:
        x: First value.
        y: Second value.

    Returns:
        ``(x - y) / min(x, y)``. When the smaller value is zero the ratio is
        undefined, so ``0.0`` is returned if the values are equal and a signed
        infinity (matching the sign of ``x - y``) otherwise.
    """
    difference = x - y
    smaller = min(x, y)
    if smaller == 0:
        # Avoid ZeroDivisionError: any change away from zero is unbounded in
        # relative terms, while "no change" is reported as exactly zero.
        if difference == 0:
            return 0.0
        return float("inf") if difference > 0 else float("-inf")
    return difference / smaller
116116

117-
def check_rapid_change(recent_df, recent_api_df, date_list):
117+
def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo):
    """Print a warning when the row count for one date departs sharply from the API average.

    The number of rows in ``recent_df`` whose ``time_value`` equals
    ``checking_date`` is compared with the average number of rows per day in
    ``recent_api_df`` (its row count divided by ``len(date_list)``). If the
    two differ by more than 35% relative to the smaller of them, two
    diagnostic lines are printed.

    Args:
        checking_date: Value compared against ``recent_df['time_value']``.
        recent_df: DataFrame with a ``time_value`` column.
        recent_api_df: DataFrame of reference rows; only its row count is used.
        date_list: Sized collection of the reference days covered by
            ``recent_api_df``; only its length is used.
        sig: Signal identifier, used only in the printed message.
        geo: Geo type identifier, used only in the printed message.

    Returns:
        None.
    """
    if len(date_list) == 0:
        # No reference days: the per-day API average is undefined, so there is
        # nothing to compare against.
        return

    recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0]
    recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list)

    if min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day) == 0:
        # The relative difference is undefined when one side is zero. Going
        # from/to zero rows is a rapid change unless both sides are zero.
        changed_rapidly = recent_rows_per_reporting_day != recent_api_rows_per_reporting_day
    else:
        changed_rapidly = abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35

    if changed_rapidly:
        print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)")
        print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo)
124124

125125

126+
def check_avg_val_diffs(recent_df, recent_api_df):
    """Print and return standardized mean differences between recent and API data.

    Column means of ``recent_df`` (after dropping ``geo_id``) are compared with
    column means of ``recent_api_df`` (``val``, ``se`` and ``sample_size``
    averaged per ``geo_value`` first, then across geo values). Only columns
    present in both sets of means take part, so the numerator and the
    denominator of each statistic are computed over the same columns.

    Args:
        recent_df: DataFrame containing a ``geo_id`` column plus numeric
            columns to compare (presumably ``val``, ``se``, ``sample_size`` --
            TODO confirm against the caller).
        recent_api_df: DataFrame with ``geo_value``, ``val``, ``se`` and
            ``sample_size`` columns.

    Returns:
        Tuple ``(mean_stddiff, mean_stdabsdiff)``: the mean signed and mean
        absolute column difference, each multiplied by two and divided by the
        sum of the two overall means.
    """
    print("recent_df dtypes", recent_df.dtypes)

    api_value_cols = ['val', 'se', 'sample_size']

    # numeric_only keeps non-numeric columns (e.g. dates or strings) from
    # breaking or polluting the column means.
    mean_recent_df = recent_df.drop(columns=['geo_id']).mean(numeric_only=True)
    mean_recent_api_df = recent_api_df.groupby('geo_value')[api_value_cols].mean().mean()

    # Restrict both sides to the columns they share; otherwise the subtraction
    # silently drops unmatched columns (NaN) while the denominator still
    # averages over them.
    shared_cols = mean_recent_df.index.intersection(mean_recent_api_df.index)
    mean_recent_df = mean_recent_df[shared_cols]
    mean_recent_api_df = mean_recent_api_df[shared_cols]

    #mean.stddiff = (mean(recent-semirecent)*2/(mean(recent)+mean(semirecent)))
    col_diffs = mean_recent_df - mean_recent_api_df
    scale = mean_recent_df.mean() + mean_recent_api_df.mean()
    mean_stddiff = (col_diffs.mean() * 2) / scale
    mean_stdabsdiff = (col_diffs.abs().mean() * 2) / scale
    print("mean_stddiff", mean_stddiff)
    print("mean_stdabsdiff", mean_stdabsdiff)
    return mean_stddiff, mean_stdabsdiff
139+
140+
126141
# The daterange function is exclusive of the end_date in line with the native python range()
127-
for check_date in daterange(start_date, end_date):
128-
print(check_date.strftime("%Y-%m-%d"))
142+
# for check_date in daterange(start_date, end_date):
143+
# print(check_date.strftime("%Y-%m-%d"))
129144

130145

131-
def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True):
146+
def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True):
132147

133148
meta = covidcast.metadata()
134149
fb_meta = meta[meta['data_source']==DATA_SOURCE]
@@ -162,7 +177,7 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti
162177
date_slist = [dt.strftime("%Y%m%d") for dt in date_list]
163178
print(date_slist)
164179

165-
data_folder = Path("data/")
180+
data_folder = Path("../data")
166181

167182
filenames = read_relevant_date_filenames(data_folder, date_slist)
168183

@@ -200,7 +215,11 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti
200215
if (recent_df["se"].isnull().mean() > 0.5):
201216
print('Recent se values are >50% NA')
202217

203-
check_rapid_change(recent_df, recent_api_df, date_list)
218+
#if sanity_check_rows_per_day:
219+
# check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo)
220+
221+
if sanity_check_value_diffs:
222+
check_avg_val_diffs(recent_df, recent_api_df)
204223
kroc += 1
205224
if kroc == 2:
206225
break

0 commit comments

Comments
 (0)