@@ -114,21 +114,36 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date):
 def reldiff_by_min(x, y):
     return (x - y) / min(x, y)

-def check_rapid_change(recent_df, recent_api_df, date_list):
+def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo):
     recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0]
     recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list)

-    if (sanity_check_rows_per_day and abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35):
+    if (abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35):
         print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)")
         print("The suspicious spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo)


+def check_avg_val_diffs(recent_df, recent_api_df):
+    print("recent_df dtypes", recent_df.dtypes)
+    recent_df = recent_df.drop(columns=['geo_id'])
+    mean_recent_df = recent_df.mean()
+    recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean()
+    recent_api_df = recent_api_df.drop(columns=['geo_value'])
+    mean_recent_api_df = recent_api_df.mean()
+
+    # mean.stddiff = (mean(recent-semirecent)*2/(mean(recent)+mean(semirecent)))
+    mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean())
+    mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean())
+    print("mean_stddiff", mean_stddiff)
+    print("mean_stdabsdiff", mean_stdabsdiff)
+
+
 # The daterange function is exclusive of the end_date in line with the native python range()
-for check_date in daterange(start_date, end_date):
-    print(check_date.strftime("%Y-%m-%d"))
+# for check_date in daterange(start_date, end_date):
+#     print(check_date.strftime("%Y-%m-%d"))


-def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True):
+def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True):

     meta = covidcast.metadata()
     fb_meta = meta[meta['data_source']==DATA_SOURCE]
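
For reference, a minimal sketch (toy numbers only, not values from this PR) of how the two checks introduced in this hunk behave: abs(reldiff_by_min(x, y)) normalizes the gap by the smaller value, so the 0.35 threshold flags roughly a one-third swing in rows per reporting day, and check_avg_val_diffs compares column-wise means of the recent frame against a reference frame.

import pandas as pd

def reldiff_by_min(x, y):
    # Relative difference normalized by the smaller value; abs() of it is symmetric in x and y.
    return (x - y) / min(x, y)

# Row-count check: 120 rows on the latest day vs. an average of 80 rows/day in the API window.
print(abs(reldiff_by_min(120, 80)))  # 0.5 -> exceeds the 0.35 threshold and would be flagged

# Average-value check: column-wise means of a "recent" frame vs. a reference frame,
# mirroring the mean_stddiff / mean_stdabsdiff arithmetic above (toy data).
recent = pd.DataFrame({'val': [10.0, 12.0], 'se': [1.0, 1.2], 'sample_size': [200.0, 220.0]})
reference = pd.DataFrame({'val': [9.5, 11.5], 'se': [1.1, 1.3], 'sample_size': [190.0, 210.0]})
mean_recent, mean_reference = recent.mean(), reference.mean()
mean_stddiff = ((mean_recent - mean_reference).mean() * 2) / (mean_recent.mean() + mean_reference.mean())
mean_stdabsdiff = ((mean_recent - mean_reference).abs().mean() * 2) / (mean_recent.mean() + mean_reference.mean())
print(mean_stddiff, mean_stdabsdiff)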
@@ -162,7 +177,7 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti
     date_slist = [dt.strftime("%Y%m%d") for dt in date_list]
     print(date_slist)

-    data_folder = Path("data/")
+    data_folder = Path("../data")


     filenames = read_relevant_date_filenames(data_folder, date_slist)
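
The date_slist construction above pairs with the daterange helper referenced in the earlier hunk. That helper is not shown in this diff, so the version below is only a sketch consistent with the comment that it excludes end_date, like Python's range():

from datetime import date, timedelta

def daterange(start_date, end_date):
    # Yields each date from start_date up to, but not including, end_date.
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)

date_list = list(daterange(date(2020, 9, 1), date(2020, 9, 4)))
date_slist = [dt.strftime("%Y%m%d") for dt in date_list]
print(date_slist)  # ['20200901', '20200902', '20200903']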
@@ -200,7 +215,11 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti
         if (recent_df["se"].isnull().mean() > 0.5):
             print('Recent se values are >50% NA')

-        check_rapid_change(recent_df, recent_api_df, date_list)
+        #if sanity_check_rows_per_day:
+        #    check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo)
+
+        if sanity_check_value_diffs:
+            check_avg_val_diffs(recent_df, recent_api_df)
         kroc += 1
         if kroc == 2:
             break
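
Putting the new switches together, a hypothetical call to fbsurvey_validation could look like the following. The filename and dates are placeholders, not values from this PR; the two flags map onto the checks in the hunk above (sanity_check_rows_per_day gates check_rapid_change, currently commented out, and sanity_check_value_diffs gates check_avg_val_diffs).

from datetime import date, timedelta

# Placeholder inputs -- illustrative only.
daily_filenames = ["20200906_county_raw_cli.csv"]

fbsurvey_validation(
    daily_filenames,
    sdate=date(2020, 9, 1),
    edate=date(2020, 9, 7),
    max_check_lookbehind=timedelta(days=7),
    sanity_check_rows_per_day=False,   # row-count check (check_rapid_change) is disabled in this hunk
    sanity_check_value_diffs=True,     # run check_avg_val_diffs inside the validation loop
)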