
Commit 52fa474

fixup! [wip] validation package
*replace print/exit with raised exceptions
1 parent 185a4b7 commit 52fa474

File tree

1 file changed: +237 -0 lines

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
import sys
import re
import math
from pathlib import Path
from itertools import product
from datetime import date, datetime, timedelta

import pandas as pd
import numpy as np
import covidcast  # needed for covidcast.signal() in validate() below

from .datafetcher import *

negated_regex_dict = {
    'county': r'^(?!\d{5}).*$',
    'hrr': r'^(?!\d{1,3}).*$',
    'msa': r'^(?!\d{5}).*$',
    'state': r'^(?![A-Z]{2}).*$',
    'national': r'(?!usa).*$'
}
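
# Illustrative note (not part of the original diff): each pattern is a negative
# lookahead, so it matches exactly the geo_ids that do NOT conform, e.g. for
# the 'county' pattern:
#   re.match(r'^(?!\d{5}).*$', '01001')  # -> None (valid 5-digit FIPS, ignored)
#   re.match(r'^(?!\d{5}).*$', '1001')   # -> match (flagged as non-conforming)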

class ValidationError(Exception):
    def __init__(self, expression, message):
        self.expression = expression
        self.message = message
        super().__init__(message)
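
# Illustrative usage (not part of the original diff): callers can recover both
# the offending value and the message from a raised ValidationError:
#   try:
#       check_bad_geo_id(df, 'zip')
#   except ValidationError as e:
#       print(e.expression, e.message)  # -> zip Unrecognized geo type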

def make_date_filter(start_date, end_date):
    start_code = int(start_date.strftime("%Y%m%d"))
    end_code = int(end_date.strftime("%Y%m%d"))
    def f(filename, match):
        if not match:
            return False
        code = int(match.groupdict()['date'])
        # inclusive bounds, so files dated exactly on start/end are validated too
        return start_code <= code <= end_code
    return f
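
# Illustrative example (assumes the 'date' capture group in filename_regex from
# datafetcher): a filter for June 2020 keeps only matches dated in the window:
#   f = make_date_filter(date(2020, 6, 1), date(2020, 6, 30))
#   f(fname, filename_regex.match(fname))  # True for 20200615_*, False for 20200715_*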

def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True):

    # Perform some automated format and sanity checks of df_to_test
    if not isinstance(max_check_lookbehind, int) or len(str(max_check_lookbehind)) != 1:
        raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be length 1, integer type")

    if not isinstance(generation_date, date) or generation_date > date.today():
        raise ValidationError(generation_date, f"generation_date ({generation_date}) must be a length 1 date that is not in the future.")

    # example: 20200624_county_smoothed_nohh_cmnty_cli
    pattern_found = filename_regex.match(nameformat)
    if not nameformat or not pattern_found:
        raise ValidationError(nameformat, f'nameformat ({nameformat}) not recognized')

def check_bad_geo_id(df_to_test, geo_type):
    if geo_type not in negated_regex_dict:
        raise ValidationError(geo_type, "Unrecognized geo type")

    def find_all_unexpected_geo_ids(df_to_test, negated_regex):
        unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0]
        if len(unexpected_geos) > 0:
            raise ValidationError(unexpected_geos, "Non-conforming geo_ids exist!")

    find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type])

def check_missing_dates(daily_filenames, sdate, edate):
    number_of_dates = edate - sdate + timedelta(days=1)
    date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)}
    unique_dates = set()
    unique_dates_obj = set()

    for daily_filename in daily_filenames:
        unique_dates.add(daily_filename[0:8])
    for unique_date in unique_dates:
        newdate_obj = datetime.strptime(unique_date, '%Y%m%d')
        unique_dates_obj.add(newdate_obj)

    check_dateholes = date_seq.difference(unique_dates_obj)

    if check_dateholes:
        print("Missing dates are observed; if these dates are already in the API they would not be updated")
        print(check_dateholes)
def check_bad_val(df_to_test):
77+
# if (not df_to_test[(df_to_test['val'] > 100)].empty):
78+
# print("val column can't have any cell greater than 100")
79+
# sys.exit()
80+
if (df_to_test.isnull().values.any()):
81+
raise ValidationError(None,"val column can't have any cell set to null")
82+
if (not df_to_test[(df_to_test['val'] < 0)].empty):
83+
raise ValidationError(None,"val column can't have any cell smaller than 0")
84+

def check_bad_se(df):
    if df['se'].isnull().values.any():
        raise ValidationError(None, "se must not be NA")

    df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', inplace=True)

    df['se'] = df['se'].round(3)
    df['se_upper_limit'] = df['se_upper_limit'].round(3)

    result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))')

    if not result.empty:
        raise ValidationError(None, "se must be in (0, min(50, val*(1+eps))]")
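
# Worked example (illustrative): for val = 10 and effective_sample_size = 1000,
# se_upper_limit = (10 * 1000 + 50) / (1000 + 1) ≈ 10.04, so any se in
# (0, 10.04] that is also below 50 passes the query in check_bad_se above.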

def check_bad_sample_size(df):
    if df['sample_size'].isnull().values.any() or df['effective_sample_size'].isnull().values.any():
        raise ValidationError(None, "sample size can't be NA")

    qresult = df.query('(sample_size < 100) | (effective_sample_size < 100)')

    if not qresult.empty:
        raise ValidationError(None, "sample size must be >= 100")

def check_min_allowed_max_date(generation_date, max_date, max_weighted_date):
    if (max_weighted_date < generation_date - timedelta(days=4)
            or max_date < generation_date - timedelta(days=1)):
        raise ValidationError(None, "latest date of generated file seems too long ago")

def reldiff_by_min(x, y):
    return (x - y) / min(x, y)
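
# Worked example (illustrative): reldiff_by_min(135, 100) = 35/100 = 0.35,
# exactly the threshold used by check_rapid_change below; dividing by the
# smaller of the two values makes this more sensitive than a plain relative
# difference.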

def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo):
    recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0]
    recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list)

    if abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35:
        raise ValidationError((checking_date, sig, geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)")

def check_avg_val_diffs(recent_df, recent_api_df, smooth_option):
    #print("recent_df dtypes", recent_df.dtypes)
    recent_df = recent_df.drop(columns=['geo_id'])
    mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean()
    recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean()
    recent_api_df = recent_api_df.drop(columns=['geo_value'])

    mean_recent_api_df = recent_api_df.mean()

    mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean())
    mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean())
    #print("mean_stddiff", mean_stddiff)
    #print("mean_stdabsdiff", mean_stdabsdiff)
    #print("type(mean_stdabsdiff)", type(mean_stdabsdiff))

    classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff']
    raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes)

    smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5))

    # Code reference from R code
    # changesum.by.variable.with.flags = changesum.by.variable %>>%
    #   dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] |
    #                   variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]],
    #                 mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>%
    # TODO: check what the purpose of variable=="val" is in the above statement

    switcher = {
        'raw': raw_thresholds,
        'smoothed': smoothed_thresholds,
    }
    # Pick the thresholds for this smoothing option; unknown options are an error
    if smooth_option not in switcher:
        raise ValidationError(smooth_option, "Invalid smoothing option")
    thres = switcher[smooth_option]

    #print(np.absolute(mean_stddiff) > thres.loc['mean.stddiff'])
    mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff']).bool()
    mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool()

    if mean_stddiff_high or mean_stdabsdiff_high:
        raise ValidationError(None, 'Average differences in variables by geoid between recent & semirecent data seem '
                              + 'large --- either a large increase tending toward one direction or a large mean absolute '
                              + 'difference, relative to average values of corresponding variables. For the former '
                              + 'check, tolerances for `val` are more restrictive than those for other columns.')
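
# Worked example (illustrative): for smoothed signals the raw thresholds are
# divided by sqrt(7) * 1.5 ≈ 3.97, so e.g. the 'mean.stddiff' threshold drops
# from 0.50 to roughly 0.126, reflecting the lower variance of 7-day averages.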

def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True):

    export_files = read_filenames(export_dir)
    date_filter = make_date_filter(start_date, end_date)
    validate_files = [(f, m) for (f, m) in export_files if date_filter(f, m)]

    all_frames = []

    # First, check file formats
    # check_missing_dates expects bare filenames, not (filename, match) tuples
    check_missing_dates([filename for (filename, match) in validate_files], start_date, end_date)
    for filename, match in validate_files:
        df = load_csv(filename)
        check_bad_geo_id(df, match.groupdict()['geo_type'])
        check_bad_val(df)
        check_bad_se(df)
        check_bad_sample_size(df)
        df['geo_type'] = match.groupdict()['geo_type']
        df['date'] = match.groupdict()['date']
        df['signal'] = match.groupdict()['signal']
        all_frames.append(df)

    # Multi-indexed dataframe for a given (signal, geo_type)

    ## recent_lookbehind: start from the check date and working backward in time,
    ## how many days do we include in the window of dates to check for anomalies?
    ## Choosing 1 day checks just the check date itself.
    recent_lookbehind = timedelta(days=1)

    ## semirecent_lookbehind: starting from the check date and working backward
    ## in time, how many days -- before subtracting out the "recent" days --
    ## do we use to form the reference statistics?
    semirecent_lookbehind = timedelta(days=7)
    smooth_option_regex = re.compile(r'([^_]+)')

    # NOTE (WIP): geo_sig_cmbo, data_folder, filenames, date_slist, date_list,
    # and DATA_SOURCE are not defined in this module yet; they are expected to
    # come from the datafetcher wildcard import or from later commits.
    kroc = 0
    for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist):

        m = smooth_option_regex.match(sig)
        smooth_option = m.group(1)

        #recent_df.set_index("time_value", inplace = True)
        print("Printing recent_df shape:", recent_df.shape)
        print(recent_df)
        for checking_date in date_list:
            #print(recent_df.loc[checking_date,:])
            # -recent- dataframe runs backward from the checking_date
            recent_end_date = checking_date - timedelta(days=1)
            recent_begin_date = checking_date - max_check_lookbehind
            recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo)

            recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True)
            recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True)

            column_names = ["geo_value", "val", "se", "sample_size", "time_value"]

            recent_api_df = recent_api_df.reindex(columns=column_names)
            if recent_df["se"].isnull().mean() > 0.5:
                print('Recent se values are >50% NA')

            if sanity_check_rows_per_day:
                check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo)

            if sanity_check_value_diffs:
                check_avg_val_diffs(recent_df, recent_api_df, smooth_option)
        kroc += 1
        if kroc == 2:
            break
    sys.exit()
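
For context, a minimal sketch of how this WIP validator might be invoked (the module path and export directory name below are assumptions for illustration, not part of this commit):

    from datetime import date
    from validation.validate import validate  # hypothetical import path

    # validate every daily export CSV in receiving/ dated within June 2020
    validate("receiving", date(2020, 6, 1), date(2020, 6, 30))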
