@@ -142,15 +142,30 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option):
142
142
143
143
smoothed_thresholds = raw_thresholds .apply (lambda x : x / (math .sqrt (7 ) * 1.5 ))
144
144
145
-
146
-
145
+ # Reference: the equivalent logic in the original R implementation
146
+ # changesum.by.variable.with.flags = changesum.by.variable %>>%
147
+ # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] |
148
+ # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]],
149
+ # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>%
150
+
151
+ switcher = {
152
+ 'raw' : raw_thresholds ,
153
+ 'smoothed' : smoothed_thresholds ,
154
+ }
155
+ # Look up the thresholds that match the requested smoothing option
156
+ thres = switcher .get (smooth_option , lambda : "Invalid smoothing option" )
157
+
158
+ mean .stddiff .high = mean_stddiff .abs () > thres .loc ['mean.stddiff' ] or
159
+ mean_stddiff .abs () > thres .loc ['val.mean.stddiff"' ]
160
+ mean .stdabsdiff .high = mean_stdabsdiff > thres .loc ['mean.stdabsdiff' ]
161
+
162
+ if mean .stddiff .high or mean .stdabsdiff .high :
163
+ print ('Average differences in variables by geoid between recent & semirecent data seem \
164
+ large --- either large increase tending toward one direction or large mean absolute \
165
+ difference, relative to average values of corresponding variables. For the former \
166
+ check, tolerances for `val` are more restrictive than those for other columns.' ')
147
167
148
168
149
- # The daterange function is exclusive of the end_date in line with the native python range()
150
- # for check_date in daterange(start_date, end_date):
151
- # print(check_date.strftime("%Y-%m-%d"))
152
-
153
-
154
169
def fbsurvey_validation (daily_filenames , sdate , edate , max_check_lookbehind = timedelta (days = 7 ), sanity_check_rows_per_day = True , sanity_check_value_diffs = True ):
155
170
156
171
meta = covidcast .metadata ()
0 commit comments