From 088f8aa0bcc86d9f3ee36511d57fa9f2f6a8e349 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 16 Jul 2020 12:21:13 -0400 Subject: [PATCH 001/151] Initial commit for the validation --- validator/driver.py | 16 ++++++++ validator/fbsurveyvalidation.py | 67 +++++++++++++++++++++++++++++++++ validator/getdata.py | 62 ++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 validator/driver.py create mode 100644 validator/fbsurveyvalidation.py create mode 100644 validator/getdata.py diff --git a/validator/driver.py b/validator/driver.py new file mode 100644 index 000000000..302ef4247 --- /dev/null +++ b/validator/driver.py @@ -0,0 +1,16 @@ +from fbsurveyvalidation import * +from getdata import * + +# Defining start date and end date for the last fb-survey pipeline execution +survey_sdate = "2020-06-13" +survey_edate = "2020-06-20" +dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') +dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') +print(dtobj_sdate.date()) +print(dtobj_edate.date()) + + +# Collecting all filenames +daily_filnames = read_filenames("./data") + +fbsurvey_validation(daily_filnames, dtobj_sdate, dtobj_edate) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py new file mode 100644 index 000000000..a9b214c1c --- /dev/null +++ b/validator/fbsurveyvalidation.py @@ -0,0 +1,67 @@ +import sys +import re +import pandas as pd +from datetime import date + + +#def validate_daily(df_to_test, nameformat, covidcast_reference_dfs, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working): +def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): + + # Perform some automated format and sanity checks of =df.to.test= + if(type(max_check_lookbehind) != int | len(str(max_check_look_behind) != 1)): + sys.exit(" 
=max_check_lookbehind= must be length 1, integer type") + + if( not isinstance(generation_date, datetime.date) or generation_date > date.today()): + sys.exit("=generation.date= must be a length 1 Date that is not in the future.") + # example: 20200624_county_smoothed_nohh_cmnty_cli + filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw|smoothed)_(\w*)([ci]li).csv$') + pattern_found = filename_regex.match(nameformat) + if (not nameformat or not pattern_found): + sys.exit('=nameformat= not recognized as a daily format') + + + +def main(): + print("Inside main") + df_to_test = pd.read_csv( + "data/20200613_county_raw_cli.csv", + dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float + }) + + print(df_to_test.head()) + print(df_to_test.describe()) + + result = df_to_test.dtypes + print(result) + + sys.exit() + + #validate_daily(df_to_test, nameformat, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working) + print(date.today()) + +def check_missing_dates(daily_filenames, sdate, edate): + number_of_dates = edate - sdate + timedelta(days=1) + #print(number_of_dates) + + date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} + #print(date_seq) + + unique_dates = set() + unique_dates_obj = set() + + for daily_filename in daily_filenames: + unique_dates.add(daily_filename[0:8]) + + for unique_date in unique_dates: + newdate_obj = datetime.strptime(unique_date, '%Y%m%d') + unique_dates_obj.add(newdate_obj) + + check_dateholes = date_seq.difference(unique_dates_obj) + + if check_dateholes: + print("Missing dates are observed; if these dates are already in the API they would not be updated") + print(check_dateholes) + +def fbsurvey_validation(daily_filnames): + + check_missing_dates(daily_filenames, ) \ No newline at end of file diff --git a/validator/getdata.py b/validator/getdata.py new file mode 100644 index 000000000..85c81f497 --- /dev/null +++ 
b/validator/getdata.py @@ -0,0 +1,62 @@ +from os import listdir, stat +from os.path import isfile, join +import platform +import covidcast +from datetime import date, datetime, timedelta + + +def read_filenames(path): + daily_filenames = [f for f in listdir(path) if isfile(join(path, f))] + return daily_filenames + + +def new_stuff(): + survey_sdate = "2020-06-13" + survey_edate = "2020-06-20" + dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') + dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') + print(dtobj_sdate.date()) + print(dtobj_edate.date()) + + number_of_dates = dtobj_edate - dtobj_sdate + timedelta(days=1) + print(number_of_dates) + + date_seq = {dtobj_sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} + print(date_seq) + + # 1) Lets first fetch all daily filenames + + + data = covidcast.signal("fb-survey", "raw_ili", date(2020, 6, 19), date(2020, 6, 19), + "state") + #print(data) + + + unique_dates = set() + unique_dates_obj = set() + + for daily_filename in daily_filenames: + unique_dates.add(daily_filename[0:8]) + + for unique_date in unique_dates: + newdate_obj = datetime.strptime(unique_date, '%Y%m%d') + unique_dates_obj.add(newdate_obj) + + check_dateholes = date_seq.difference(unique_dates_obj) + if check_dateholes: + print("Date holes exist!") + print(check_dateholes) + + + + + + +#print(data) +#print(data.dtypes) + +#print(type(data)) + +#meta = covidcast.metadata() +#meta.to_csv('meta_out.csv') +#print(meta) \ No newline at end of file From 284ac9d51eebcc98ec7f60de196bb765b1dbd84b Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 16 Jul 2020 13:45:25 -0400 Subject: [PATCH 002/151] Correcting regular expression for filenames --- validator/fbsurveyvalidation.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index a9b214c1c..5475414cc 100644 --- a/validator/fbsurveyvalidation.py +++ 
b/validator/fbsurveyvalidation.py @@ -62,6 +62,31 @@ def check_missing_dates(daily_filenames, sdate, edate): print("Missing dates are observed; if these dates are already in the API they would not be updated") print(check_dateholes) -def fbsurvey_validation(daily_filnames): +def fbsurvey_validation(daily_filnames, sdate, edate): - check_missing_dates(daily_filenames, ) \ No newline at end of file + check_missing_dates(daily_filenames, sdate, edate) + + # Examples: + # raw_cli + # raw_ili + # raw_wcli + # raw_wili + # raw_hh_cmnty_cli + # raw_nohh_cmnty_cli + filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw|smoothed)_(\w*)([ci]li).csv$') + for f in daily_filnames: + # example: 20200624_county_smoothed_nohh_cmnty_cli + + m = filename_regex.match(f) + geo_type = m.group(2) + + if m.group(4): + signal = "_".join([m.group(3), m.group(4), m.group(5)]) + else: + signal = "_".join([m.group(3), m.group(5)]) + + if (not nameformat or not pattern_found): + sys.exit('=nameformat= not recognized as a daily format') + + + fetch_daily_data \ No newline at end of file From ebd93a3717beeaec5be1eb0ee69c107856a906f0 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 16 Jul 2020 17:53:24 -0400 Subject: [PATCH 003/151] Breaking down nested dataframes for complete date range for more streamlined daily dataframe fetch --- validator/{getdata.py => datafetcher.py} | 5 +++++ validator/driver.py | 2 +- validator/fbsurveyvalidation.py | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) rename validator/{getdata.py => datafetcher.py} (88%) diff --git a/validator/getdata.py b/validator/datafetcher.py similarity index 88% rename from validator/getdata.py rename to validator/datafetcher.py index 85c81f497..db5a4633e 100644 --- a/validator/getdata.py +++ b/validator/datafetcher.py @@ -10,6 +10,11 @@ def read_filenames(path): return daily_filenames +def fetch_daily_data(data_source, survey_date, geo_type, signal): + data_to_validate = covidcast.signal(data_source, signal, 
survey_date, survey_day, geo_type) + return data_to_validate + + def new_stuff(): survey_sdate = "2020-06-13" survey_edate = "2020-06-20" diff --git a/validator/driver.py b/validator/driver.py index 302ef4247..cef2362de 100644 --- a/validator/driver.py +++ b/validator/driver.py @@ -1,5 +1,5 @@ from fbsurveyvalidation import * -from getdata import * +from datafetcher import * # Defining start date and end date for the last fb-survey pipeline execution survey_sdate = "2020-06-13" diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 5475414cc..ad6f00765 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -3,6 +3,7 @@ import pandas as pd from datetime import date +DATA_SOURCE = "fb-survey" #def validate_daily(df_to_test, nameformat, covidcast_reference_dfs, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working): def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): @@ -75,9 +76,9 @@ def fbsurvey_validation(daily_filnames, sdate, edate): # raw_nohh_cmnty_cli filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw|smoothed)_(\w*)([ci]li).csv$') for f in daily_filnames: - # example: 20200624_county_smoothed_nohh_cmnty_cli - + # example: 20200624_county_smoothed_nohh_cmnty_cli m = filename_regex.match(f) + survey_date = datetime.strptime(m.group(1), '%Y%m%d').date() geo_type = m.group(2) if m.group(4): @@ -88,5 +89,4 @@ def fbsurvey_validation(daily_filnames, sdate, edate): if (not nameformat or not pattern_found): sys.exit('=nameformat= not recognized as a daily format') - - fetch_daily_data \ No newline at end of file + df_to_validate = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) From 6babdba602bf0f655739deb25d57aef7fb473900 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 20 Jul 2020 07:25:39 
-0400 Subject: [PATCH 004/151] Minor regex change for filename capture. --- validator/datafetcher.py | 2 +- validator/fbsurveyvalidation.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/validator/datafetcher.py b/validator/datafetcher.py index db5a4633e..3448087c9 100644 --- a/validator/datafetcher.py +++ b/validator/datafetcher.py @@ -11,7 +11,7 @@ def read_filenames(path): def fetch_daily_data(data_source, survey_date, geo_type, signal): - data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_day, geo_type) + data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) return data_to_validate diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index ad6f00765..d5d46afeb 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -1,7 +1,8 @@ import sys import re import pandas as pd -from datetime import date +from datetime import date, datetime, timedelta +from datafetcher import * DATA_SOURCE = "fb-survey" @@ -42,17 +43,13 @@ def main(): def check_missing_dates(daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) - #print(number_of_dates) - date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} - #print(date_seq) unique_dates = set() unique_dates_obj = set() for daily_filename in daily_filenames: unique_dates.add(daily_filename[0:8]) - for unique_date in unique_dates: newdate_obj = datetime.strptime(unique_date, '%Y%m%d') unique_dates_obj.add(newdate_obj) @@ -62,8 +59,10 @@ def check_missing_dates(daily_filenames, sdate, edate): if check_dateholes: print("Missing dates are observed; if these dates are already in the API they would not be updated") print(check_dateholes) + + return -def fbsurvey_validation(daily_filnames, sdate, edate): +def fbsurvey_validation(daily_filenames, sdate, edate): check_missing_dates(daily_filenames, sdate, edate) @@ -74,19 
+73,26 @@ def fbsurvey_validation(daily_filnames, sdate, edate): # raw_wili # raw_hh_cmnty_cli # raw_nohh_cmnty_cli - filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw|smoothed)_(\w*)([ci]li).csv$') - for f in daily_filnames: + filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') + for f in daily_filenames: # example: 20200624_county_smoothed_nohh_cmnty_cli + print("Printing filename") + print(f) m = filename_regex.match(f) survey_date = datetime.strptime(m.group(1), '%Y%m%d').date() geo_type = m.group(2) if m.group(4): - signal = "_".join([m.group(3), m.group(4), m.group(5)]) + signal = "".join([m.group(4), m.group(5)]) + signal = "_".join([m.group(3), signal]) else: signal = "_".join([m.group(3), m.group(5)]) - if (not nameformat or not pattern_found): + if (not m.group(0)): sys.exit('=nameformat= not recognized as a daily format') df_to_validate = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) + + print(df_to_validate) + + break \ No newline at end of file From 0b28b63fe962aba0b8d217bc0c8e46c245d4dc94 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 20 Jul 2020 09:17:01 -0400 Subject: [PATCH 005/151] Adding custom exception files for readability --- validator/datafetcher.py | 9 ++++++++- validator/errors.py | 14 ++++++++++++++ validator/fbsurveyvalidation.py | 15 ++++++++++----- 3 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 validator/errors.py diff --git a/validator/datafetcher.py b/validator/datafetcher.py index 3448087c9..e3d01f6f6 100644 --- a/validator/datafetcher.py +++ b/validator/datafetcher.py @@ -2,7 +2,9 @@ from os.path import isfile, join import platform import covidcast +import pandas as pd from datetime import date, datetime, timedelta +from errors import * def read_filenames(path): @@ -12,6 +14,12 @@ def read_filenames(path): def fetch_daily_data(data_source, survey_date, geo_type, signal): data_to_validate = covidcast.signal(data_source, signal, survey_date, 
survey_date, geo_type) + if not isinstance(data_to_validate, pd.DataFrame): + custom_msg = "Error fetching data on" + str(survey_date)+ \ + "for data source:" + data_source + \ + ", signal-type:"+ signal + \ + ", geography-type:" + geo_type + raise APIDataFetchError(custom_msg) return data_to_validate @@ -34,7 +42,6 @@ def new_stuff(): data = covidcast.signal("fb-survey", "raw_ili", date(2020, 6, 19), date(2020, 6, 19), "state") - #print(data) unique_dates = set() diff --git a/validator/errors.py b/validator/errors.py new file mode 100644 index 000000000..2e31a9528 --- /dev/null +++ b/validator/errors.py @@ -0,0 +1,14 @@ +class APIDataFetchError(Exception): + """Exception raised for errors during validation. + + Attributes: + custom_msg -- parameters which caused the error + api_msg -- explanation of the error + """ + + def __init__(self, custom_msg): + self.custom_msg = custom_msg + super().__init__(self.custom_msg) + + def __str__(self): + return '{}'.format(self.custom_msg) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index d5d46afeb..c5f7088cd 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -73,11 +73,10 @@ def fbsurvey_validation(daily_filenames, sdate, edate): # raw_wili # raw_hh_cmnty_cli # raw_nohh_cmnty_cli + i = 1 filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') for f in daily_filenames: # example: 20200624_county_smoothed_nohh_cmnty_cli - print("Printing filename") - print(f) m = filename_regex.match(f) survey_date = datetime.strptime(m.group(1), '%Y%m%d').date() geo_type = m.group(2) @@ -90,9 +89,15 @@ def fbsurvey_validation(daily_filenames, sdate, edate): if (not m.group(0)): sys.exit('=nameformat= not recognized as a daily format') - - df_to_validate = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) + try: + df_to_validate = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) + except APIDataFetchError as e: + 
print("APIDataFetchError:", e) + print("\n") + print(df_to_validate) - break \ No newline at end of file + i += 1 + if i == 16: + break \ No newline at end of file From 40a2c4de2087c4550f7d5cb3847e3d1267e3c79b Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 20 Jul 2020 13:53:25 -0400 Subject: [PATCH 006/151] Minor changes --- validator/errors.py | 1 - validator/fbsurveyvalidation.py | 45 +++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/validator/errors.py b/validator/errors.py index 2e31a9528..932b1c1f3 100644 --- a/validator/errors.py +++ b/validator/errors.py @@ -3,7 +3,6 @@ class APIDataFetchError(Exception): Attributes: custom_msg -- parameters which caused the error - api_msg -- explanation of the error """ def __init__(self, custom_msg): diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index c5f7088cd..76ebea025 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -41,10 +41,26 @@ def main(): #validate_daily(df_to_test, nameformat, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working) print(date.today()) +def check_bad_geo_id(df_to_test, signal): + df[df['geo_id'].str.count('^*')>0] + switch(signal) { + case 'msa': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] + break; + case 'county': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] + break; + case 'state': df_to_test[df_to_test['geo_id'].str.count('[A-Z]{2}') != 0] + break; + case 'hrr': df_to_test[df_to_test['geo_id'].str.count('\d{1,3}') != 0] + break; + case 'national': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] + break; + default: sys.exit("Unknown geo type.") + + } + def check_missing_dates(daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) - date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} - + date_seq = {sdate + timedelta(days=x) for x 
in range(number_of_dates.days)} unique_dates = set() unique_dates_obj = set() @@ -61,6 +77,12 @@ def check_missing_dates(daily_filenames, sdate, edate): print(check_dateholes) return +def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): + if (max_weighted_date < generation_date - timedelta(days=4) + or max_date < generation_date - timedelta(days=1)): + sys.exit("latest date of generated file seems too long ago") + return + def fbsurvey_validation(daily_filenames, sdate, edate): @@ -84,13 +106,22 @@ def fbsurvey_validation(daily_filenames, sdate, edate): if m.group(4): signal = "".join([m.group(4), m.group(5)]) signal = "_".join([m.group(3), signal]) + max_weighted_date = survey_date else: signal = "_".join([m.group(3), m.group(5)]) + max_date = survey_date if (not m.group(0)): sys.exit('=nameformat= not recognized as a daily format') + + df_to_test = pd.read_csv( + "data/20200613_county_raw_cli.csv", + dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float + }) + + try: - df_to_validate = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) + df_ref = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) except APIDataFetchError as e: print("APIDataFetchError:", e) print("\n") @@ -99,5 +130,9 @@ def fbsurvey_validation(daily_filenames, sdate, edate): print(df_to_validate) i += 1 - if i == 16: - break \ No newline at end of file + if i == 2: + break + + check_min_allowed_max_date(generation_date, max_date, max_weighted_date) + check_bad_geo_id(df_to_test, signal) + From 77be3f0fa119c8b57eb19c0ea601cdd7f4c7321c Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 23 Jul 2020 16:19:34 -0400 Subject: [PATCH 007/151] Geotype validation logic correction --- validator/fbsurveyvalidation.py | 61 ++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 76ebea025..54a22beff 
100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -1,6 +1,7 @@ import sys import re import pandas as pd +from pathlib import Path from datetime import date, datetime, timedelta from datafetcher import * @@ -27,7 +28,11 @@ def main(): print("Inside main") df_to_test = pd.read_csv( "data/20200613_county_raw_cli.csv", - dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float + dtype={'geo_id': str, + 'val': float, + 'se': float, + 'sample_size': float, + 'effective_sample_size': float }) print(df_to_test.head()) @@ -41,23 +46,30 @@ def main(): #validate_daily(df_to_test, nameformat, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working) print(date.today()) -def check_bad_geo_id(df_to_test, signal): - df[df['geo_id'].str.count('^*')>0] - switch(signal) { - case 'msa': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] - break; - case 'county': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] - break; - case 'state': df_to_test[df_to_test['geo_id'].str.count('[A-Z]{2}') != 0] - break; - case 'hrr': df_to_test[df_to_test['geo_id'].str.count('\d{1,3}') != 0] - break; - case 'national': df_to_test[df_to_test['geo_id'].str.count('\d{5}') != 0] - break; - default: sys.exit("Unknown geo type.") - +def check_bad_geo_id(df_to_test, geo_type): + if geo_type not in negated_regex_dict: + print("Unrecognized geo type:", geo_type ) + sys.exit() + + def find_all_unexpected_geo_ids(df_to_test, negated_regex): + unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] + if(len(unexpected_geos) > 0): + print("Non-conforming geo_ids exist!") + print(unexpected_geos) + sys.exit() + + negated_regex_dict = { + 'county': '^(?!\d{5}).*$', + 'hrr': '^(?!\d{1,3}).*$', + 'msa': '^(?!\d{5}).*$', + 'state': '^(?![A-Z]{2}).*$' + 'national': '(?!usa).*$' } + find_all_unexpected_geo_ids(df_to_test, 
negated_regex_dict[geo_type]) + + + def check_missing_dates(daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} @@ -86,6 +98,8 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): def fbsurvey_validation(daily_filenames, sdate, edate): + data_folder = Path("data/") + check_missing_dates(daily_filenames, sdate, edate) # Examples: @@ -99,6 +113,13 @@ def fbsurvey_validation(daily_filenames, sdate, edate): filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') for f in daily_filenames: # example: 20200624_county_smoothed_nohh_cmnty_cli + + df_to_test = pd.read_csv( + data_folder / f, + dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float + }) + + m = filename_regex.match(f) survey_date = datetime.strptime(m.group(1), '%Y%m%d').date() geo_type = m.group(2) @@ -114,12 +135,6 @@ def fbsurvey_validation(daily_filenames, sdate, edate): if (not m.group(0)): sys.exit('=nameformat= not recognized as a daily format') - df_to_test = pd.read_csv( - "data/20200613_county_raw_cli.csv", - dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float - }) - - try: df_ref = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) except APIDataFetchError as e: @@ -134,5 +149,5 @@ def fbsurvey_validation(daily_filenames, sdate, edate): break check_min_allowed_max_date(generation_date, max_date, max_weighted_date) - check_bad_geo_id(df_to_test, signal) + check_bad_geo_id(df_to_test, geo_type) From e398c8866ae63cc2fb19123c2cf1cbe31d71f8c7 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 23 Jul 2020 19:01:48 -0400 Subject: [PATCH 008/151] Modifying val check --- validator/fbsurveyvalidation.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/validator/fbsurveyvalidation.py 
b/validator/fbsurveyvalidation.py index 54a22beff..48df9f4a0 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -68,8 +68,6 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - - def check_missing_dates(daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} @@ -89,12 +87,24 @@ def check_missing_dates(daily_filenames, sdate, edate): print(check_dateholes) return + +def check_bad_val(df_to_test): + if (df_to_test[(df_to_test['val'] > 100)].empty): + print("val column can't have any cell greater than 100") + sys.exit() + if (df_to_test.isnull().values.any()): + print("val column can't have any cell set to null") + sys.exit() + if (df_to_test[(df_to_test['val'] < 0)].empty): + print("val column can't have any cell smaller than 0") + sys.exit() + def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): if (max_weighted_date < generation_date - timedelta(days=4) or max_date < generation_date - timedelta(days=1)): sys.exit("latest date of generated file seems too long ago") return - + def fbsurvey_validation(daily_filenames, sdate, edate): @@ -150,4 +160,5 @@ def fbsurvey_validation(daily_filenames, sdate, edate): check_min_allowed_max_date(generation_date, max_date, max_weighted_date) check_bad_geo_id(df_to_test, geo_type) + check_bad_val(df_to_test) From 449c09f9911299ace0eb9e19a98d72597e0684cc Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 24 Jul 2020 03:30:07 -0400 Subject: [PATCH 009/151] Handling incorrect floating point se check --- validator/fbsurveyvalidation.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 48df9f4a0..151c61138 100644 --- a/validator/fbsurveyvalidation.py +++ 
b/validator/fbsurveyvalidation.py @@ -89,16 +89,32 @@ def check_missing_dates(daily_filenames, sdate, edate): return def check_bad_val(df_to_test): - if (df_to_test[(df_to_test['val'] > 100)].empty): + if (not df_to_test[(df_to_test['val'] > 100)].empty): print("val column can't have any cell greater than 100") sys.exit() if (df_to_test.isnull().values.any()): print("val column can't have any cell set to null") sys.exit() - if (df_to_test[(df_to_test['val'] < 0)].empty): + if (not df_to_test[(df_to_test['val'] < 0)].empty): print("val column can't have any cell smaller than 0") sys.exit() +def check_bad_se(df): + if (df['se'].isnull().values.any()): + print("se must not be NA") + sys.exit() + + df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', inplace=True) + + df['se']= df['se'].round(6) + df['se_upper_limit'] = df['se_upper_limit'].apply(np.ceil) + + result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + + if not result.empty: + print("se must be in (0,min(50,val*(1+eps))]") + sys.exit() + def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): if (max_weighted_date < generation_date - timedelta(days=4) or max_date < generation_date - timedelta(days=1)): @@ -161,4 +177,5 @@ def fbsurvey_validation(daily_filenames, sdate, edate): check_min_allowed_max_date(generation_date, max_date, max_weighted_date) check_bad_geo_id(df_to_test, geo_type) check_bad_val(df_to_test) + check_bad_se(df_to_test) From 0161c47948d1f4097fbc4fa13cbfa8adec1f7169 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 24 Jul 2020 03:31:29 -0400 Subject: [PATCH 010/151] Adding numpy dependency --- validator/fbsurveyvalidation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 151c61138..b6089ef32 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -1,6 +1,7 @@ import sys import re import pandas as pd +import 
numpy as np from pathlib import Path from datetime import date, datetime, timedelta from datafetcher import * From a3070860f3f86b0c8caf579e9cf015a47f40ea30 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 24 Jul 2020 03:44:12 -0400 Subject: [PATCH 011/151] Removing numpy ceiling func --- validator/fbsurveyvalidation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index b6089ef32..b8670f8a5 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -1,7 +1,6 @@ import sys import re import pandas as pd -import numpy as np from pathlib import Path from datetime import date, datetime, timedelta from datafetcher import * @@ -107,8 +106,8 @@ def check_bad_se(df): df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', inplace=True) - df['se']= df['se'].round(6) - df['se_upper_limit'] = df['se_upper_limit'].apply(np.ceil) + df['se']= df['se'].round(3) + df['se_upper_limit'] = df['se_upper_limit'].round(3) result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') From a7d7a58496b2136ed2d1e8f94f325e0c742058d7 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 24 Jul 2020 12:12:56 -0400 Subject: [PATCH 012/151] sample size filtering --- validator/fbsurveyvalidation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index b8670f8a5..006c5a9be 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -115,6 +115,18 @@ def check_bad_se(df): print("se must be in (0,min(50,val*(1+eps))]") sys.exit() +def check_bad_sample_size(df): + if(df['sample_size'].isnull.values.any() | df['effective_sample_size'].isnull.values.any()): + print("sample size can't be NA") + sys.exit() + + qresult = df.query('(sample_size < 100) | (effective_sample_size < 100)') + + if not qresult.empty: + print("sample size must 
be >= 100") + sys.exit() + + def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): if (max_weighted_date < generation_date - timedelta(days=4) or max_date < generation_date - timedelta(days=1)): @@ -178,4 +190,5 @@ def fbsurvey_validation(daily_filenames, sdate, edate): check_bad_geo_id(df_to_test, geo_type) check_bad_val(df_to_test) check_bad_se(df_to_test) + check_bad_sample_size(df_to_test) From 45fbb0eeedf16ec794a0c1a1006cd83753198526 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 30 Jul 2020 02:02:48 -0400 Subject: [PATCH 013/151] Reading signals and geo_types from metadata() --- validator/datafetcher.py | 32 ++++++++++++++++++++ validator/driver.py | 2 +- validator/fbsurveyvalidation.py | 53 ++++++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/validator/datafetcher.py b/validator/datafetcher.py index e3d01f6f6..bd024098d 100644 --- a/validator/datafetcher.py +++ b/validator/datafetcher.py @@ -11,6 +11,38 @@ def read_filenames(path): daily_filenames = [f for f in listdir(path) if isfile(join(path, f))] return daily_filenames +def read_relevant_date_filenames(data_path, date_slist): + all_files = listdir(data_path) + filenames = list() + + for fl in all_files: + for dt in date_slist: + if fl.find(dt) != -1: + filenames.append(fl) + return filenames + +def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + for geo_sig in geo_sig_cmbo: + df_list = list() + + files = list(filter(lambda x: geo_sig[0] in x and geo_sig[1] in x, filenames)) + if(len(files) == 0): + print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") + for f in files: + df = pd.read_csv( + data_folder / f, + dtype={'geo_id': str, + 'val': float, + 'se': float, + 'sample_size': float, + 'effective_sample_size': float + }) + for dt in date_slist: + if f.find(dt) != -1: + gen_dt = datetime.strptime(dt, '%Y%m%d') + df['time_value'] = gen_dt + 
df_list.append(df) + yield pd.concat(df_list) def fetch_daily_data(data_source, survey_date, geo_type, signal): data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) diff --git a/validator/driver.py b/validator/driver.py index cef2362de..7556843fe 100644 --- a/validator/driver.py +++ b/validator/driver.py @@ -3,7 +3,7 @@ # Defining start date and end date for the last fb-survey pipeline execution survey_sdate = "2020-06-13" -survey_edate = "2020-06-20" +survey_edate = "2020-06-19" dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') print(dtobj_sdate.date()) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 006c5a9be..7d5f88636 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -2,6 +2,7 @@ import re import pandas as pd from pathlib import Path +from itertools import product from datetime import date, datetime, timedelta from datafetcher import * @@ -62,7 +63,7 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): 'county': '^(?!\d{5}).*$', 'hrr': '^(?!\d{1,3}).*$', 'msa': '^(?!\d{5}).*$', - 'state': '^(?![A-Z]{2}).*$' + 'state': '^(?![A-Z]{2}).*$', 'national': '(?!usa).*$' } @@ -135,8 +136,58 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): def fbsurvey_validation(daily_filenames, sdate, edate): + + meta = covidcast.metadata() + fb_meta = meta[meta['data_source']==DATA_SOURCE] + unique_signals = fb_meta['signal'].unique().tolist() + unique_geotypes = fb_meta['geo_type'].unique().tolist() + + ##### Currently metadata returns --*community*-- signals that don't get generated + ##### in the new fb-pipeline. Seiving them out for now. 
+ # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli + for sig in unique_signals: + if "community" in sig: + unique_signals.remove(sig) + + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) + print(geo_sig_cmbo) + print("Number of mixed types:", len(geo_sig_cmbo)) + + for cmb in geo_sig_cmbo: + print(cmb) + + ## The following dates refer to the newly generated files from latest pipeline execution + ######----start_date-----####### + ######----end_date------####### + #start_date = date(2020, 6, 16) + #end_date = date(2020, 6, 19) + delta_days = (edate + timedelta(days=1)) - sdate + print("Number of days: ", delta_days.days) + date_list = [sdate + timedelta(days=x) for x in range(delta_days.days)] + print(date_list) + date_slist = [dt.strftime("%Y%m%d") for dt in date_list] + print(date_slist) + data_folder = Path("data/") + + filenames = read_relevant_date_filenames(data_folder, date_slist) + + # Multi-indexed dataframe for a given (signal, geo_type) + + kroc = 0 + for recent_df in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + print(recent_df) + kroc += 1 + if kroc == 2: + break + sys.exit() + + + + + + check_missing_dates(daily_filenames, sdate, edate) From 14fb6ce3e3872d1dcc7899fedfecc0887ed9c184 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 30 Jul 2020 10:08:14 -0400 Subject: [PATCH 014/151] Rapid rise check --- validator/datafetcher.py | 2 +- validator/fbsurveyvalidation.py | 63 +++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/validator/datafetcher.py b/validator/datafetcher.py index bd024098d..dac15c63f 100644 --- a/validator/datafetcher.py +++ b/validator/datafetcher.py @@ -42,7 +42,7 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): gen_dt = datetime.strptime(dt, '%Y%m%d') df['time_value'] = gen_dt df_list.append(df) - yield pd.concat(df_list) + yield pd.concat(df_list), geo_sig[0], geo_sig[1] def 
fetch_daily_data(data_source, survey_date, geo_type, signal): data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 7d5f88636..523e90dce 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -134,8 +134,22 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): sys.exit("latest date of generated file seems too long ago") return +def reldiff_by_min(x, y): + return (x - y) / min(x,y) -def fbsurvey_validation(daily_filenames, sdate, edate): +def check_rapid_change(start_date, end_date, max_check_lookbehind): + #base = datetime.datetime.today() + base = start_date + date_list = [base - timedelta(days=x) for x in range(n_past_days)] + #print(date_list) + + +# The daterange function is exclusive of the end_date in line with the native python range() + for check_date in daterange(start_date, end_date): + print(check_date.strftime("%Y-%m-%d")) + + +def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True): meta = covidcast.metadata() fb_meta = meta[meta['data_source']==DATA_SOURCE] @@ -175,12 +189,55 @@ def fbsurvey_validation(daily_filenames, sdate, edate): # Multi-indexed dataframe for a given (signal, geo_type) + ## recent_lookbehind: start from the check date and working backward in time, + ## how many days do we include in the window of date to check for anomalies? + ## Choosing 1 day checks just the check data itself. + recent_lookbehind = timedelta(days=1) + + ## semirecent_lookbehind: starting from the check date and working backward + ## in time, how many days -- before subtracting out the "recent" days --- + ## do we use to form the reference statistics? 
+ semirecent_lookbehind = timedelta(days=7) + kroc = 0 - for recent_df in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): print(recent_df) + #recent_df.set_index("time_value", inplace = True) + + for checking_date in date_list: + #print(recent_df.loc[checking_date,:]) + # -recent- dataframe run backwards from the checking_date + recent_end_date = checking_date - timedelta(days=1) + recent_begin_date = checking_date - max_check_lookbehind + recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo) + print("Checking recent_api_df") + + recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) + recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + + column_names = ["geo_value", "val", "se", "sample_size", "time_value"] + + recent_api_df = recent_api_df.reindex(columns=column_names) + print(recent_api_df) + print(recent_api_df.dtypes) + if (recent_df["se"].isnull().mean() > 0.5): + print('Recent se values are >50% NA') + + recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] + print(recent_rows_per_reporting_day) + recent_api_rows_per_reporting_day = recent_df.shape[0] / len(date_list) + print("recent_api_rows_per_reporting_day", recent_api_rows_per_reporting_day) + + if(sanity_check_rows_per_day and abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): + print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") + print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) + kroc += 1 - if kroc == 2: + if kroc == 2: break + + + sys.exit() From 2da91383db34dabbcebf32faa55c9e0454730fe3 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 30 Jul 2020 10:48:24 -0400 Subject: [PATCH 015/151] Minor 
change --- validator/fbsurveyvalidation.py | 43 ++++++++++----------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/validator/fbsurveyvalidation.py b/validator/fbsurveyvalidation.py index 523e90dce..734f99d1b 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/fbsurveyvalidation.py @@ -137,11 +137,13 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): def reldiff_by_min(x, y): return (x - y) / min(x,y) -def check_rapid_change(start_date, end_date, max_check_lookbehind): - #base = datetime.datetime.today() - base = start_date - date_list = [base - timedelta(days=x) for x in range(n_past_days)] - #print(date_list) +def check_rapid_change(recent_df, recent_api_df, date_list): + recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] + recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) + + if(sanity_check_rows_per_day and abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): + print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") + print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) # The daterange function is exclusive of the end_date in line with the native python range() @@ -201,50 +203,31 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti kroc = 0 for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): - print(recent_df) - #recent_df.set_index("time_value", inplace = True) + #recent_df.set_index("time_value", inplace = True) + print("Printing recent_df scenes:", recent_df.shape) + print(recent_df) for checking_date in date_list: #print(recent_df.loc[checking_date,:]) - # -recent- dataframe run backwards from the checking_date + # -recent- dataframe run backwards from the checking_date recent_end_date = checking_date - 
timedelta(days=1) recent_begin_date = checking_date - max_check_lookbehind recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo) - print("Checking recent_api_df") - + recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) column_names = ["geo_value", "val", "se", "sample_size", "time_value"] recent_api_df = recent_api_df.reindex(columns=column_names) - print(recent_api_df) - print(recent_api_df.dtypes) if (recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') - recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] - print(recent_rows_per_reporting_day) - recent_api_rows_per_reporting_day = recent_df.shape[0] / len(date_list) - print("recent_api_rows_per_reporting_day", recent_api_rows_per_reporting_day) - - if(sanity_check_rows_per_day and abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): - print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") - print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) - + check_rapid_change(recent_df, recent_api_df, date_list) kroc += 1 if kroc == 2: break - - - sys.exit() - - - - - - check_missing_dates(daily_filenames, sdate, edate) From 1d378c23423a9b7c0de4747574ed0a10b78b05a3 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Thu, 30 Jul 2020 13:36:41 -0400 Subject: [PATCH 016/151] Package compliance --- validator/README.md | 57 +++++++++++++++++++ validator/REVIEW.md | 39 +++++++++++++ validator/delphi_validator/__init__.py | 13 +++++ validator/delphi_validator/__main__.py | 11 ++++ .../{ => delphi_validator}/datafetcher.py | 0 validator/{ => delphi_validator}/driver.py | 0 validator/{ => delphi_validator}/errors.py | 0 .../fbsurveyvalidation.py | 23 -------- 
validator/delphi_validator/run.py | 14 +++++ validator/params.json.template | 6 ++ validator/setup.py | 27 +++++++++ 11 files changed, 167 insertions(+), 23 deletions(-) create mode 100644 validator/README.md create mode 100644 validator/REVIEW.md create mode 100644 validator/delphi_validator/__init__.py create mode 100644 validator/delphi_validator/__main__.py rename validator/{ => delphi_validator}/datafetcher.py (100%) rename validator/{ => delphi_validator}/driver.py (100%) rename validator/{ => delphi_validator}/errors.py (100%) rename validator/{ => delphi_validator}/fbsurveyvalidation.py (93%) create mode 100644 validator/delphi_validator/run.py create mode 100644 validator/params.json.template create mode 100644 validator/setup.py diff --git a/validator/README.md b/validator/README.md new file mode 100644 index 000000000..3e7d5f98f --- /dev/null +++ b/validator/README.md @@ -0,0 +1,57 @@ +# Validator + + + +## Running the Indicator + +The indicator is run by directly executing the Python module contained in this +directory. The safest way to do this is to create a virtual environment, +install the common DELPHI tools, and then install the module and its +dependencies. To do this, run the following code from this directory: + +``` +python -m venv env +source env/bin/activate +pip install ../_delphi_utils_python/. +pip install . +``` + +All of the user-changeable parameters are stored in `params.json`. To execute +the module and produce the output datasets (by default, in `receiving`), run +the following: + +``` +env/bin/python -m delphi_NAME +``` + +Once you are finished with the code, you can deactivate the virtual environment +and (optionally) remove the environment itself. + +``` +deactivate +rm -r env +``` + +## Testing the code + +To do a static test of the code style, it is recommended to run **pylint** on 
To do this, run the following from the main module directory: + +``` +env/bin/pylint delphi_NAME +``` + +The most aggressive checks are turned off; only relatively important issues +should be raised and they should be manually checked (or better, fixed). + +Unit tests are also included in the module. To execute these, run the following +command from this directory: + +``` +(cd tests && ../env/bin/pytest --cov=delphi_NAME --cov-report=term-missing) +``` + +The output will show the number of unit tests that passed and failed, along +with the percentage of code covered by the tests. None of the tests should +fail and the code lines that are not covered by unit tests should be small and +should not include critical sub-routines. diff --git a/validator/REVIEW.md b/validator/REVIEW.md new file mode 100644 index 000000000..93a5a6579 --- /dev/null +++ b/validator/REVIEW.md @@ -0,0 +1,39 @@ +## Code Review (Python) + +A code review of this module should include a careful look at the code and the +output. To assist in the process, but certainly not in replace of it, please +check the following items. 
+ +**Documentation** + +- [ ] the README.md file template is filled out and currently accurate; it is +possible to load and test the code using only the instructions given +- [ ] minimal docstrings (one line describing what the function does) are +included for all functions; full docstrings describing the inputs and expected +outputs should be given for non-trivial functions + +**Structure** + +- [ ] code should use 4 spaces for indentation; other style decisions are +flexible, but be consistent within a module +- [ ] any required metadata files are checked into the repository and placed +within the directory `static` +- [ ] any intermediate files that are created and stored by the module should +be placed in the directory `cache` +- [ ] final expected output files to be uploaded to the API are placed in the +`receiving` directory; output files should not be committed to the respository +- [ ] all options and API keys are passed through the file `params.json` +- [ ] template parameter file (`params.json.template`) is checked into the +code; no personal (i.e., usernames) or private (i.e., API keys) information is +included in this template file + +**Testing** + +- [ ] module can be installed in a new virtual environment +- [ ] pylint with the default `.pylint` settings run over the module produces +minimal warnings; warnings that do exist have been confirmed as false positives +- [ ] reasonably high level of unit test coverage covering all of the main logic +of the code (e.g., missing coverage for raised errors that do not currently seem +possible to reach are okay; missing coverage for options that will be needed are +not) +- [ ] all unit tests run without errors diff --git a/validator/delphi_validator/__init__.py b/validator/delphi_validator/__init__.py new file mode 100644 index 000000000..52a507259 --- /dev/null +++ b/validator/delphi_validator/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +"""Module to pull and clean indicators from the XXXXX source. 
+ +This file defines the functions that are made public by the module. As the +module is intended to be executed though the main method, these are primarily +for testing. +""" + +from __future__ import absolute_import + +from . import run + +__version__ = "0.1.0" diff --git a/validator/delphi_validator/__main__.py b/validator/delphi_validator/__main__.py new file mode 100644 index 000000000..bf03405fe --- /dev/null +++ b/validator/delphi_validator/__main__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +"""Call the function run_module when executed. + +This file indicates that calling the module (`python -m MODULE_NAME`) will +call the function `run_module` found within the run.py file. There should be +no need to change this template. +""" + +from .run import run_module # pragma: no cover + +run_module() # pragma: no cover diff --git a/validator/datafetcher.py b/validator/delphi_validator/datafetcher.py similarity index 100% rename from validator/datafetcher.py rename to validator/delphi_validator/datafetcher.py diff --git a/validator/driver.py b/validator/delphi_validator/driver.py similarity index 100% rename from validator/driver.py rename to validator/delphi_validator/driver.py diff --git a/validator/errors.py b/validator/delphi_validator/errors.py similarity index 100% rename from validator/errors.py rename to validator/delphi_validator/errors.py diff --git a/validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py similarity index 93% rename from validator/fbsurveyvalidation.py rename to validator/delphi_validator/fbsurveyvalidation.py index 734f99d1b..6bef2310f 100644 --- a/validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -24,29 +24,6 @@ def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_c sys.exit('=nameformat= not recognized as a daily format') - -def main(): - print("Inside main") - df_to_test = pd.read_csv( - "data/20200613_county_raw_cli.csv", - 
dtype={'geo_id': str, - 'val': float, - 'se': float, - 'sample_size': float, - 'effective_sample_size': float - }) - - print(df_to_test.head()) - print(df_to_test.describe()) - - result = df_to_test.dtypes - print(result) - - sys.exit() - - #validate_daily(df_to_test, nameformat, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working) - print(date.today()) - def check_bad_geo_id(df_to_test, geo_type): if geo_type not in negated_regex_dict: print("Unrecognized geo type:", geo_type ) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py new file mode 100644 index 000000000..d137d8e77 --- /dev/null +++ b/validator/delphi_validator/run.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +"""Functions to call when running the function. + +This module should contain a function called `run_module`, that is executed +when the module is run with `python -m MODULE_NAME`. +""" +import numpy as np +import pandas as pd +from delphi_utils import read_params + + +def run_module(): + + params = read_params() diff --git a/validator/params.json.template b/validator/params.json.template new file mode 100644 index 000000000..02da51d1c --- /dev/null +++ b/validator/params.json.template @@ -0,0 +1,6 @@ +{ + "start_date": "2020-06-13", + "end_date": "2020-06-19", + "ref_window_size": 7 + "data_folder": "/data/" +} \ No newline at end of file diff --git a/validator/setup.py b/validator/setup.py new file mode 100644 index 000000000..4ece26fec --- /dev/null +++ b/validator/setup.py @@ -0,0 +1,27 @@ +from setuptools import setup +from setuptools import find_packages + +required = [ + "numpy", + "pandas", + "pytest", + "pytest-cov", + "pylint", + "delphi-utils" +] + +setup( + name="delphi_validator", + version="0.1.0", + description="Validates newly generated daily-data against previously issued data", + author="", + author_email="", + url="https://github.com/cmu-delphi/covidcast-indicators", + 
install_requires=required, + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3.7", + ], + packages=find_packages(), +) \ No newline at end of file From c5edfd14b1b4337f253e71c98cb607a5f83ae38c Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Tue, 4 Aug 2020 08:36:41 -0400 Subject: [PATCH 017/151] Average diff check correction --- .../delphi_validator/fbsurveyvalidation.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index 6bef2310f..9b7047eda 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -114,21 +114,36 @@ def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): def reldiff_by_min(x, y): return (x - y) / min(x,y) -def check_rapid_change(recent_df, recent_api_df, date_list): +def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo): recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) - if(sanity_check_rows_per_day and abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): + if(abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) +def check_avg_val_diffs(recent_df, recent_api_df): + print("recent_df dtypes", recent_df.dtypes) + recent_df = recent_df.drop(columns=['geo_id']) + mean_recent_df = recent_df.mean() + recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() + 
recent_api_df = recent_api_df.drop(columns=['geo_value']) + mean_recent_api_df = recent_api_df.mean() + + #mean.stddiff = (mean(recent-semirecent)*2/(mean(recent)+mean(semirecent))) + mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) + mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) + print("mean_stddiff", mean_stddiff) + print("mean_stdabsdiff", mean_stdabsdiff) + + # The daterange function is exclusive of the end_date in line with the native python range() - for check_date in daterange(start_date, end_date): - print(check_date.strftime("%Y-%m-%d")) +# for check_date in daterange(start_date, end_date): +# print(check_date.strftime("%Y-%m-%d")) -def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True): +def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): meta = covidcast.metadata() fb_meta = meta[meta['data_source']==DATA_SOURCE] @@ -162,7 +177,7 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti date_slist = [dt.strftime("%Y%m%d") for dt in date_list] print(date_slist) - data_folder = Path("data/") + data_folder = Path("../data") filenames = read_relevant_date_filenames(data_folder, date_slist) @@ -200,7 +215,11 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti if (recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') - check_rapid_change(recent_df, recent_api_df, date_list) + #if sanity_check_rows_per_day: + # check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) + + if sanity_check_value_diffs: + check_avg_val_diffs(recent_df, recent_api_df) kroc += 1 if kroc == 2: break From 
9d183634f0e9c6277b7f488ead5f0e8ce2ead727 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Tue, 4 Aug 2020 13:21:30 -0400 Subject: [PATCH 018/151] Adding smoothing thresholds --- .../delphi_validator/fbsurveyvalidation.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index 9b7047eda..1c51d6a16 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -123,19 +123,27 @@ def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) -def check_avg_val_diffs(recent_df, recent_api_df): +def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): print("recent_df dtypes", recent_df.dtypes) recent_df = recent_df.drop(columns=['geo_id']) - mean_recent_df = recent_df.mean() + mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() recent_api_df = recent_api_df.drop(columns=['geo_value']) + mean_recent_api_df = recent_api_df.mean() - #mean.stddiff = (mean(recent-semirecent)*2/(mean(recent)+mean(semirecent))) mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) print("mean_stddiff", mean_stddiff) print("mean_stdabsdiff", mean_stdabsdiff) + + classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] + raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) + + smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) + + + # The daterange function is exclusive of the end_date in line with the native python range() @@ -192,10 
+200,14 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti ## in time, how many days -- before subtracting out the "recent" days --- ## do we use to form the reference statistics? semirecent_lookbehind = timedelta(days=7) + smooth_option_regex = re.compile(r'([^_]+)') kroc = 0 for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + m = smooth_option_regex.match(sig) + smooth_option = m.group(1) + #recent_df.set_index("time_value", inplace = True) print("Printing recent_df scenes:", recent_df.shape) print(recent_df) @@ -219,7 +231,7 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti # check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) if sanity_check_value_diffs: - check_avg_val_diffs(recent_df, recent_api_df) + check_avg_val_diffs(recent_df, recent_api_df, smooth_option) kroc += 1 if kroc == 2: break From 45986ad1c71c8761247c36ef673d42b84f49fc1e Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Tue, 4 Aug 2020 13:47:41 -0400 Subject: [PATCH 019/151] Edits for avg diff check --- .../delphi_validator/fbsurveyvalidation.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index 1c51d6a16..901aab895 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -142,15 +142,30 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) - - + # Code reference from R code + # changesum.by.variable.with.flags = changesum.by.variable %>>% + # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | + # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], + # mean.stdabsdiff.high = mean.stdabsdiff > 
thresholds[["mean.stdabsdiff"]]) %>>% + + switcher = { + 'raw': raw_thresholds, + 'smoothed': smoothed_thresholds, + } + # Get the function from switcher dictionary + thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") + + mean.stddiff.high = mean_stddiff.abs() > thres.loc['mean.stddiff'] or + mean_stddiff.abs() > thres.loc['val.mean.stddiff"'] + mean.stdabsdiff.high = mean_stdabsdiff > thres.loc['mean.stdabsdiff'] + + if mean.stddiff.high or mean.stdabsdiff.high: + print('Average differences in variables by geoid between recent & semirecent data seem \ + large --- either large increase tending toward one direction or large mean absolute \ + difference, relative to average values of corresponding variables. For the former \ + check, tolerances for `val` are more restrictive than those for other columns.'') -# The daterange function is exclusive of the end_date in line with the native python range() -# for check_date in daterange(start_date, end_date): -# print(check_date.strftime("%Y-%m-%d")) - - def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): meta = covidcast.metadata() From b8eae79654003b96d5597bd0b250de5ffccb43ec Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Tue, 4 Aug 2020 14:04:05 -0400 Subject: [PATCH 020/151] Readme edits --- validator/README.md | 10 ++++++++++ validator/delphi_validator/driver.py | 4 ++-- validator/delphi_validator/fbsurveyvalidation.py | 5 +++-- validator/params.json.template | 1 + 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/validator/README.md b/validator/README.md index 3e7d5f98f..85bc80dc5 100644 --- a/validator/README.md +++ b/validator/README.md @@ -1,5 +1,15 @@ # Validator +The validator performs two main tasks: +1) Sanity checks on daily data generated from a pipeline of specific data + source. 
+2) It does a comparative analysis with recent data from the API + to detect any anomalies such as spikes, significant value differences + +The validator validates against daily data that's already written to disk +making the execution of the validator independent of the pipeline execution. +This creates an additional advantage of running the validation against multiple +days of daily data and having a better cumulative analysis. ## Running the Indicator diff --git a/validator/delphi_validator/driver.py b/validator/delphi_validator/driver.py index 7556843fe..ea213e635 100644 --- a/validator/delphi_validator/driver.py +++ b/validator/delphi_validator/driver.py @@ -3,7 +3,7 @@ # Defining start date and end date for the last fb-survey pipeline execution survey_sdate = "2020-06-13" -survey_edate = "2020-06-19" +survey_edate = "2020-06-15" dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') print(dtobj_sdate.date()) @@ -11,6 +11,6 @@ # Collecting all filenames -daily_filnames = read_filenames("./data") +daily_filnames = read_filenames("../data") fbsurvey_validation(daily_filnames, dtobj_sdate, dtobj_edate) diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index 901aab895..4e4a97aa6 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -147,6 +147,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>% + # TOdo - Check whats the purpose of variable=="val" in the above statement switcher = { 'raw': raw_thresholds, @@ -242,8 +243,8 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti if 
(recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') - #if sanity_check_rows_per_day: - # check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) + if sanity_check_rows_per_day: + check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) diff --git a/validator/params.json.template b/validator/params.json.template index 02da51d1c..b801937c6 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,4 +1,5 @@ { + "data_source": "fb_survey" "start_date": "2020-06-13", "end_date": "2020-06-19", "ref_window_size": 7 From c8a5b7a8935a744d7dcb6f91862b4ce918213f11 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 7 Aug 2020 18:58:41 -0400 Subject: [PATCH 021/151] Tweaks to fetch metadata based on old/new pipeline. --- validator/delphi_validator/datafetcher.py | 85 +++++++++++++---- validator/delphi_validator/driver.py | 8 +- .../delphi_validator/fbsurveyvalidation.py | 94 +++++-------------- validator/params.json.template | 5 +- 4 files changed, 100 insertions(+), 92 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index dac15c63f..763e04f55 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -5,6 +5,71 @@ import pandas as pd from datetime import date, datetime, timedelta from errors import * +import re +from typing import List +import json + +def read_params(params_filepath): + with open(params_filepath) as f: + config = json.load(f) + return config + +def get_filenames_with_geo_signal(path, date_slist: List[str]): + + if pipeline_version == 'new': + meta = covidcast.metadata() + fb_meta = meta[meta['data_source']==DATA_SOURCE] + unique_signals = fb_meta['signal'].unique().tolist() + unique_geotypes = fb_meta['geo_type'].unique().tolist() + + + ##### Currently 
metadata returns --*community*-- signals that don't get generated + ##### in the new fb-pipeline. Seiving them out for now. + # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli + for sig in unique_signals: + if "community" in sig: + unique_signals.remove(sig) + + + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) + print(geo_sig_cmbo) + print("Number of mixed types:", len(geo_sig_cmbo)) + + for cmb in geo_sig_cmbo: + print(cmb) + + + filenames = read_relevant_date_filenames(data_folder, date_slist[0]) + + else: + sdate = date_slist[0] + filenames = [f for f in listdir(path) if isfile(join(path, f))] + + sdate_filenames = [fname for fname in filenames if fname.find(sdate) != -1] + + # example: 20200624_county_smoothed_nohh_cmnty_cli + filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') + geo_sig_cmbo = list() + for f in sdate_filenames: + + m = filename_regex.match(f) + if (not m.group(0)): + print('=nameformat= not recognized as a daily format') + + geo_type = m.group(2) + + + if m.group(4): # weighted data 'w' + signal = "".join([m.group(4), m.group(5)]) + signal = "_".join([m.group(3), signal]) + # max_weighted_date = survey_date + else: + signal = "_".join([m.group(3), m.group(5)]) + # max_date = survey_date + + geo_sig_cmbo.append((geo_type, signal)) + + return filenames, geo_sig_cmbo def read_filenames(path): @@ -12,7 +77,7 @@ def read_filenames(path): return daily_filenames def read_relevant_date_filenames(data_path, date_slist): - all_files = listdir(data_path) + all_files = [f for f in listdir(path) if isfile(join(data_path, f))] filenames = list() for fl in all_files: @@ -69,9 +134,6 @@ def new_stuff(): date_seq = {dtobj_sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} print(date_seq) - # 1) Lets first fetch all daily filenames - - data = covidcast.signal("fb-survey", "raw_ili", date(2020, 6, 19), date(2020, 6, 19), "state") @@ -90,17 +152,4 @@ def new_stuff(): if 
check_dateholes: print("Date holes exist!") print(check_dateholes) - - - - - - -#print(data) -#print(data.dtypes) - -#print(type(data)) - -#meta = covidcast.metadata() -#meta.to_csv('meta_out.csv') -#print(meta) \ No newline at end of file + \ No newline at end of file diff --git a/validator/delphi_validator/driver.py b/validator/delphi_validator/driver.py index ea213e635..ba9f1a53d 100644 --- a/validator/delphi_validator/driver.py +++ b/validator/delphi_validator/driver.py @@ -4,11 +4,11 @@ # Defining start date and end date for the last fb-survey pipeline execution survey_sdate = "2020-06-13" survey_edate = "2020-06-15" -dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') -dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') -print(dtobj_sdate.date()) -print(dtobj_edate.date()) +# Dev Alert: Remove template extention +params = read_params("..\params.json.template") +dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') +dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') # Collecting all filenames daily_filnames = read_filenames("../data") diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index 4e4a97aa6..a434bdb98 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -1,10 +1,12 @@ import sys import re import pandas as pd +import numpy as np from pathlib import Path from itertools import product from datetime import date, datetime, timedelta from datafetcher import * +import math DATA_SOURCE = "fb-survey" @@ -136,6 +138,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) print("mean_stddiff", mean_stddiff) print("mean_stdabsdiff", mean_stdabsdiff) + print("type(mean_stdabsdiff)",type(mean_stdabsdiff)) classes = ['mean.stddiff', 'val.mean.stddiff', 
'mean.stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) @@ -147,7 +150,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>% - # TOdo - Check whats the purpose of variable=="val" in the above statement + # Todo - Check whats the purpose of variable=="val" in the above statement switcher = { 'raw': raw_thresholds, @@ -156,38 +159,19 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): # Get the function from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - mean.stddiff.high = mean_stddiff.abs() > thres.loc['mean.stddiff'] or - mean_stddiff.abs() > thres.loc['val.mean.stddiff"'] - mean.stdabsdiff.high = mean_stdabsdiff > thres.loc['mean.stdabsdiff'] + print(np.absolute(mean_stddiff) > thres.loc['mean.stddiff']) + mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() + mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() - if mean.stddiff.high or mean.stdabsdiff.high: - print('Average differences in variables by geoid between recent & semirecent data seem \ - large --- either large increase tending toward one direction or large mean absolute \ - difference, relative to average values of corresponding variables. 
For the former \ - check, tolerances for `val` are more restrictive than those for other columns.'') - -def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): - - meta = covidcast.metadata() - fb_meta = meta[meta['data_source']==DATA_SOURCE] - unique_signals = fb_meta['signal'].unique().tolist() - unique_geotypes = fb_meta['geo_type'].unique().tolist() - - ##### Currently metadata returns --*community*-- signals that don't get generated - ##### in the new fb-pipeline. Seiving them out for now. - # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli - for sig in unique_signals: - if "community" in sig: - unique_signals.remove(sig) + if mean_stddiff_high or mean_stdabsdiff_high: + print('Average differences in variables by geoid between recent & semirecent data seem' \ + + 'large --- either large increase tending toward one direction or large mean absolute' \ + + 'difference, relative to average values of corresponding variables. 
For the former' \ + + 'check, tolerances for `val` are more restrictive than those for other columns.') - geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) - print(geo_sig_cmbo) - print("Number of mixed types:", len(geo_sig_cmbo)) - - for cmb in geo_sig_cmbo: - print(cmb) +def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): ## The following dates refer to the newly generated files from latest pipeline execution ######----start_date-----####### @@ -201,9 +185,15 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti date_slist = [dt.strftime("%Y%m%d") for dt in date_list] print(date_slist) + filenames, geo_sig = get_filenames_with_geo_signal(data_folder, date_slist) + data_folder = Path("../data") - filenames = read_relevant_date_filenames(data_folder, date_slist) + # Read metadata from filename + ## Red ALERT: Repeating filenames + sys.exit() + # + # Multi-indexed dataframe for a given (signal, geo_type) @@ -262,44 +252,12 @@ def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = ti # raw_wili # raw_hh_cmnty_cli # raw_nohh_cmnty_cli - i = 1 - filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') - for f in daily_filenames: - # example: 20200624_county_smoothed_nohh_cmnty_cli - - df_to_test = pd.read_csv( - data_folder / f, - dtype={'geo_id': str, 'val': float, 'se': float, 'sample_size': float, 'effective_sample_size': float - }) - - - m = filename_regex.match(f) - survey_date = datetime.strptime(m.group(1), '%Y%m%d').date() - geo_type = m.group(2) - - if m.group(4): - signal = "".join([m.group(4), m.group(5)]) - signal = "_".join([m.group(3), signal]) - max_weighted_date = survey_date - else: - signal = "_".join([m.group(3), m.group(5)]) - max_date = survey_date - - if (not m.group(0)): - sys.exit('=nameformat= not recognized as a daily format') - - 
try: - df_ref = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) - except APIDataFetchError as e: - print("APIDataFetchError:", e) - print("\n") - - - print(df_to_validate) - - i += 1 - if i == 2: - break + + # try: + # df_ref = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) + # except APIDataFetchError as e: + # print("APIDataFetchError:", e) + # print("\n") check_min_allowed_max_date(generation_date, max_date, max_weighted_date) check_bad_geo_id(df_to_test, geo_type) diff --git a/validator/params.json.template b/validator/params.json.template index b801937c6..f47b0b07c 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,7 +1,8 @@ { - "data_source": "fb_survey" + "data_source": "fb_survey", + "pipeline_version": "old", "start_date": "2020-06-13", "end_date": "2020-06-19", - "ref_window_size": 7 + "ref_window_size": 7, "data_folder": "/data/" } \ No newline at end of file From 7b23269816f1687c1830c4ed80ec108ef56e3d8e Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Fri, 7 Aug 2020 19:01:27 -0400 Subject: [PATCH 022/151] Removing prototype code. 
--- validator/delphi_validator/driver.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/validator/delphi_validator/driver.py b/validator/delphi_validator/driver.py index ba9f1a53d..ee1bb7811 100644 --- a/validator/delphi_validator/driver.py +++ b/validator/delphi_validator/driver.py @@ -1,10 +1,6 @@ from fbsurveyvalidation import * from datafetcher import * -# Defining start date and end date for the last fb-survey pipeline execution -survey_sdate = "2020-06-13" -survey_edate = "2020-06-15" - # Dev Alert: Remove template extention params = read_params("..\params.json.template") dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') From 6cae2b45542fb6a7c76307d8ae3dd69fdd0a5585 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 10 Aug 2020 12:48:45 -0400 Subject: [PATCH 023/151] Fixing module build errors --- validator/delphi_validator/datafetcher.py | 12 +----------- validator/delphi_validator/driver.py | 12 ------------ validator/delphi_validator/fbsurveyvalidation.py | 2 +- validator/delphi_validator/run.py | 14 +++++++++++++- validator/params.json.template | 2 +- validator/setup.py | 3 ++- 6 files changed, 18 insertions(+), 27 deletions(-) delete mode 100644 validator/delphi_validator/driver.py diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 763e04f55..87c41d657 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -4,15 +4,11 @@ import covidcast import pandas as pd from datetime import date, datetime, timedelta -from errors import * +from .errors import * import re from typing import List import json -def read_params(params_filepath): - with open(params_filepath) as f: - config = json.load(f) - return config def get_filenames_with_geo_signal(path, date_slist: List[str]): @@ -121,12 +117,6 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal): def new_stuff(): - survey_sdate = "2020-06-13" - survey_edate = "2020-06-20" - 
dtobj_sdate = datetime.strptime(survey_sdate, '%Y-%m-%d') - dtobj_edate = datetime.strptime(survey_edate, '%Y-%m-%d') - print(dtobj_sdate.date()) - print(dtobj_edate.date()) number_of_dates = dtobj_edate - dtobj_sdate + timedelta(days=1) print(number_of_dates) diff --git a/validator/delphi_validator/driver.py b/validator/delphi_validator/driver.py deleted file mode 100644 index ee1bb7811..000000000 --- a/validator/delphi_validator/driver.py +++ /dev/null @@ -1,12 +0,0 @@ -from fbsurveyvalidation import * -from datafetcher import * - -# Dev Alert: Remove template extention -params = read_params("..\params.json.template") -dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') -dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') - -# Collecting all filenames -daily_filnames = read_filenames("../data") - -fbsurvey_validation(daily_filnames, dtobj_sdate, dtobj_edate) diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py index a434bdb98..426f67fd5 100644 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ b/validator/delphi_validator/fbsurveyvalidation.py @@ -5,7 +5,7 @@ from pathlib import Path from itertools import product from datetime import date, datetime, timedelta -from datafetcher import * +from .datafetcher import * import math DATA_SOURCE = "fb-survey" diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index d137d8e77..220116468 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -2,13 +2,25 @@ """Functions to call when running the function. This module should contain a function called `run_module`, that is executed -when the module is run with `python -m MODULE_NAME`. +when the module is run with `python -m delphi_validator`. 
""" import numpy as np import pandas as pd from delphi_utils import read_params +from .fbsurveyvalidation import fbsurvey_validation +from .datafetcher import * def run_module(): params = read_params() + + + dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') + dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') + max_check_lookbehind = int(params["ref_window_size"]) + + # Collecting all filenames + daily_filnames = read_filenames(params["data_folder"]) + + fbsurvey_validation(daily_filnames, dtobj_sdate, dtobj_edate) diff --git a/validator/params.json.template b/validator/params.json.template index f47b0b07c..97daa91c6 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -4,5 +4,5 @@ "start_date": "2020-06-13", "end_date": "2020-06-19", "ref_window_size": 7, - "data_folder": "/data/" + "data_folder": "C:/Covidcast/covidcast-indicators/validator/data" } \ No newline at end of file diff --git a/validator/setup.py b/validator/setup.py index 4ece26fec..2ac236570 100644 --- a/validator/setup.py +++ b/validator/setup.py @@ -7,7 +7,8 @@ "pytest", "pytest-cov", "pylint", - "delphi-utils" + "delphi-utils", + "covidcast" ] setup( From d27b1045d3e3adb0428ef76eca2a837ba8e221da Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 10 Aug 2020 13:03:33 -0400 Subject: [PATCH 024/151] module name change in readme. --- validator/README.md | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/validator/README.md b/validator/README.md index 85bc80dc5..3591bb0e1 100644 --- a/validator/README.md +++ b/validator/README.md @@ -31,7 +31,7 @@ the module and produce the output datasets (by default, in `receiving`), run the following: ``` -env/bin/python -m delphi_NAME +env/bin/python -m delphi_validator ``` Once you are finished with the code, you can deactivate the virtual environment @@ -41,27 +41,3 @@ and (optionally) remove the environment itself. 
deactivate rm -r env ``` - -## Testing the code - -To do a static test of the code style, it is recommended to run **pylint** on -the module. To do this, run the following from the main module directory: - -``` -env/bin/pylint delphi_NAME -``` - -The most aggressive checks are turned off; only relatively important issues -should be raised and they should be manually checked (or better, fixed). - -Unit tests are also included in the module. To execute these, run the following -command from this directory: - -``` -(cd tests && ../env/bin/pytest --cov=delphi_NAME --cov-report=term-missing) -``` - -The output will show the number of unit tests that passed and failed, along -with the percentage of code covered by the tests. None of the tests should -fail and the code lines that are not covered by unit tests should be small and -should not include critical sub-routines. From 66091d1a1400ea6c1d3df2cac20af203f43ca9d7 Mon Sep 17 00:00:00 2001 From: amartyabasu Date: Mon, 10 Aug 2020 13:07:06 -0400 Subject: [PATCH 025/151] Clean up --- validator/delphi_validator/datafetcher.py | 30 +---------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 87c41d657..facda3f5e 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -114,32 +114,4 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal): ", geography-type:" + geo_type raise APIDataFetchError(custom_msg) return data_to_validate - - -def new_stuff(): - - number_of_dates = dtobj_edate - dtobj_sdate + timedelta(days=1) - print(number_of_dates) - - date_seq = {dtobj_sdate + timedelta(days=x) for x in range(number_of_dates.days + 1)} - print(date_seq) - - data = covidcast.signal("fb-survey", "raw_ili", date(2020, 6, 19), date(2020, 6, 19), - "state") - - - unique_dates = set() - unique_dates_obj = set() - - for daily_filename in daily_filenames: - 
unique_dates.add(daily_filename[0:8]) - - for unique_date in unique_dates: - newdate_obj = datetime.strptime(unique_date, '%Y%m%d') - unique_dates_obj.add(newdate_obj) - - check_dateholes = date_seq.difference(unique_dates_obj) - if check_dateholes: - print("Date holes exist!") - print(check_dateholes) - \ No newline at end of file + \ No newline at end of file From 9c54f438084f75f85342616d0719cc1067be25cf Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Fri, 4 Sep 2020 16:24:22 -0400 Subject: [PATCH 026/151] [wip] validation package\n*replace print/exit with raised exceptions --- .../delphi_validator/fbsurveyvalidation.py | 267 ------------------ validator/delphi_validator/run.py | 11 +- validator/params.json.template | 6 +- 3 files changed, 8 insertions(+), 276 deletions(-) delete mode 100644 validator/delphi_validator/fbsurveyvalidation.py diff --git a/validator/delphi_validator/fbsurveyvalidation.py b/validator/delphi_validator/fbsurveyvalidation.py deleted file mode 100644 index 426f67fd5..000000000 --- a/validator/delphi_validator/fbsurveyvalidation.py +++ /dev/null @@ -1,267 +0,0 @@ -import sys -import re -import pandas as pd -import numpy as np -from pathlib import Path -from itertools import product -from datetime import date, datetime, timedelta -from .datafetcher import * -import math - -DATA_SOURCE = "fb-survey" - -#def validate_daily(df_to_test, nameformat, covidcast_reference_dfs, generation_date, max_check_lookbehind, sanity_check_rows_per_day, sanity_check_value_diffs, check_vs_working): -def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): - - # Perform some automated format and sanity checks of =df.to.test= - if(type(max_check_lookbehind) != int | len(str(max_check_look_behind) != 1)): - sys.exit(" =max_check_lookbehind= must be length 1, integer type") - - if( not isinstance(generation_date, 
datetime.date) or generation_date > date.today()): - sys.exit("=generation.date= must be a length 1 Date that is not in the future.") - # example: 20200624_county_smoothed_nohh_cmnty_cli - filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw|smoothed)_(\w*)([ci]li).csv$') - pattern_found = filename_regex.match(nameformat) - if (not nameformat or not pattern_found): - sys.exit('=nameformat= not recognized as a daily format') - - -def check_bad_geo_id(df_to_test, geo_type): - if geo_type not in negated_regex_dict: - print("Unrecognized geo type:", geo_type ) - sys.exit() - - def find_all_unexpected_geo_ids(df_to_test, negated_regex): - unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] - if(len(unexpected_geos) > 0): - print("Non-conforming geo_ids exist!") - print(unexpected_geos) - sys.exit() - - negated_regex_dict = { - 'county': '^(?!\d{5}).*$', - 'hrr': '^(?!\d{1,3}).*$', - 'msa': '^(?!\d{5}).*$', - 'state': '^(?![A-Z]{2}).*$', - 'national': '(?!usa).*$' - } - - find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - -def check_missing_dates(daily_filenames, sdate, edate): - number_of_dates = edate - sdate + timedelta(days=1) - date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} - unique_dates = set() - unique_dates_obj = set() - - for daily_filename in daily_filenames: - unique_dates.add(daily_filename[0:8]) - for unique_date in unique_dates: - newdate_obj = datetime.strptime(unique_date, '%Y%m%d') - unique_dates_obj.add(newdate_obj) - - check_dateholes = date_seq.difference(unique_dates_obj) - - if check_dateholes: - print("Missing dates are observed; if these dates are already in the API they would not be updated") - print(check_dateholes) - - return - -def check_bad_val(df_to_test): - if (not df_to_test[(df_to_test['val'] > 100)].empty): - print("val column can't have any cell greater than 100") - sys.exit() - if (df_to_test.isnull().values.any()): - print("val 
column can't have any cell set to null") - sys.exit() - if (not df_to_test[(df_to_test['val'] < 0)].empty): - print("val column can't have any cell smaller than 0") - sys.exit() - -def check_bad_se(df): - if (df['se'].isnull().values.any()): - print("se must not be NA") - sys.exit() - - df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', inplace=True) - - df['se']= df['se'].round(3) - df['se_upper_limit'] = df['se_upper_limit'].round(3) - - result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') - - if not result.empty: - print("se must be in (0,min(50,val*(1+eps))]") - sys.exit() - -def check_bad_sample_size(df): - if(df['sample_size'].isnull.values.any() | df['effective_sample_size'].isnull.values.any()): - print("sample size can't be NA") - sys.exit() - - qresult = df.query('(sample_size < 100) | (effective_sample_size < 100)') - - if not qresult.empty: - print("sample size must be >= 100") - sys.exit() - - -def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): - if (max_weighted_date < generation_date - timedelta(days=4) - or max_date < generation_date - timedelta(days=1)): - sys.exit("latest date of generated file seems too long ago") - return - -def reldiff_by_min(x, y): - return (x - y) / min(x,y) - -def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo): - recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] - recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) - - if(abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): - print("Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") - print("The suspicous spike is for date: ", checking_date, ", signal: ", sig, ", geo_type: ", geo) - - -def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): - print("recent_df dtypes", recent_df.dtypes) - 
recent_df = recent_df.drop(columns=['geo_id']) - mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() - recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() - recent_api_df = recent_api_df.drop(columns=['geo_value']) - - mean_recent_api_df = recent_api_df.mean() - - mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) - mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) - print("mean_stddiff", mean_stddiff) - print("mean_stdabsdiff", mean_stdabsdiff) - print("type(mean_stdabsdiff)",type(mean_stdabsdiff)) - - classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] - raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) - - smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) - - # Code reference from R code - # changesum.by.variable.with.flags = changesum.by.variable %>>% - # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | - # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], - # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>% - # Todo - Check whats the purpose of variable=="val" in the above statement - - switcher = { - 'raw': raw_thresholds, - 'smoothed': smoothed_thresholds, - } - # Get the function from switcher dictionary - thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - - print(np.absolute(mean_stddiff) > thres.loc['mean.stddiff']) - mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() - mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() - - - if mean_stddiff_high or mean_stdabsdiff_high: - print('Average differences in variables by geoid between recent & semirecent data seem' 
\ - + 'large --- either large increase tending toward one direction or large mean absolute' \ - + 'difference, relative to average values of corresponding variables. For the former' \ - + 'check, tolerances for `val` are more restrictive than those for other columns.') - - -def fbsurvey_validation(daily_filenames, sdate, edate, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): - - ## The following dates refer to the newly generated files from latest pipeline execution - ######----start_date-----####### - ######----end_date------####### - #start_date = date(2020, 6, 16) - #end_date = date(2020, 6, 19) - delta_days = (edate + timedelta(days=1)) - sdate - print("Number of days: ", delta_days.days) - date_list = [sdate + timedelta(days=x) for x in range(delta_days.days)] - print(date_list) - date_slist = [dt.strftime("%Y%m%d") for dt in date_list] - print(date_slist) - - filenames, geo_sig = get_filenames_with_geo_signal(data_folder, date_slist) - - data_folder = Path("../data") - - # Read metadata from filename - ## Red ALERT: Repeating filenames - sys.exit() - # - - - # Multi-indexed dataframe for a given (signal, geo_type) - - ## recent_lookbehind: start from the check date and working backward in time, - ## how many days do we include in the window of date to check for anomalies? - ## Choosing 1 day checks just the check data itself. - recent_lookbehind = timedelta(days=1) - - ## semirecent_lookbehind: starting from the check date and working backward - ## in time, how many days -- before subtracting out the "recent" days --- - ## do we use to form the reference statistics? 
- semirecent_lookbehind = timedelta(days=7) - smooth_option_regex = re.compile(r'([^_]+)') - - kroc = 0 - for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): - - m = smooth_option_regex.match(sig) - smooth_option = m.group(1) - - #recent_df.set_index("time_value", inplace = True) - print("Printing recent_df scenes:", recent_df.shape) - print(recent_df) - for checking_date in date_list: - #print(recent_df.loc[checking_date,:]) - # -recent- dataframe run backwards from the checking_date - recent_end_date = checking_date - timedelta(days=1) - recent_begin_date = checking_date - max_check_lookbehind - recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo) - - recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) - recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) - - column_names = ["geo_value", "val", "se", "sample_size", "time_value"] - - recent_api_df = recent_api_df.reindex(columns=column_names) - if (recent_df["se"].isnull().mean() > 0.5): - print('Recent se values are >50% NA') - - if sanity_check_rows_per_day: - check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) - - if sanity_check_value_diffs: - check_avg_val_diffs(recent_df, recent_api_df, smooth_option) - kroc += 1 - if kroc == 2: - break - sys.exit() - - check_missing_dates(daily_filenames, sdate, edate) - - # Examples: - # raw_cli - # raw_ili - # raw_wcli - # raw_wili - # raw_hh_cmnty_cli - # raw_nohh_cmnty_cli - - # try: - # df_ref = fetch_daily_data(DATA_SOURCE, survey_date, geo_type, signal) - # except APIDataFetchError as e: - # print("APIDataFetchError:", e) - # print("\n") - - check_min_allowed_max_date(generation_date, max_date, max_weighted_date) - check_bad_geo_id(df_to_test, geo_type) - check_bad_val(df_to_test) - check_bad_se(df_to_test) - check_bad_sample_size(df_to_test) - diff --git a/validator/delphi_validator/run.py 
b/validator/delphi_validator/run.py index 220116468..6536b5059 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -7,15 +7,12 @@ import numpy as np import pandas as pd from delphi_utils import read_params -from .fbsurveyvalidation import fbsurvey_validation -from .datafetcher import * +from .validate import validate +from .datafetcher import read_filenames def run_module(): - - params = read_params() - - + params = read_params()["validation"] dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') max_check_lookbehind = int(params["ref_window_size"]) @@ -23,4 +20,4 @@ def run_module(): # Collecting all filenames daily_filnames = read_filenames(params["data_folder"]) - fbsurvey_validation(daily_filnames, dtobj_sdate, dtobj_edate) + validate(daily_filnames, dtobj_sdate, dtobj_edate) diff --git a/validator/params.json.template b/validator/params.json.template index 97daa91c6..e70b5ce48 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,8 +1,10 @@ { + "validation": { "data_source": "fb_survey", - "pipeline_version": "old", "start_date": "2020-06-13", "end_date": "2020-06-19", "ref_window_size": 7, - "data_folder": "C:/Covidcast/covidcast-indicators/validator/data" + "data_folder": "C:/Covidcast/covidcast-indicators/validator/data", + "minimum_sample_size": 100 + } } \ No newline at end of file From 59a2313c798bece028d6a4a731e6bc5412be748e Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Fri, 4 Sep 2020 17:26:31 -0400 Subject: [PATCH 027/151] fixup! 
[wip] validation package\n*replace print/exit with raised exceptions --- validator/delphi_validator/validate.py | 237 +++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 validator/delphi_validator/validate.py diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py new file mode 100644 index 000000000..a6be89cc3 --- /dev/null +++ b/validator/delphi_validator/validate.py @@ -0,0 +1,237 @@ +import sys +import re +import pandas as pd +import numpy as np +from pathlib import Path +from itertools import product +from datetime import date, datetime, timedelta +from .datafetcher import * +import math + +negated_regex_dict = { + 'county': '^(?!\d{5}).*$', + 'hrr': '^(?!\d{1,3}).*$', + 'msa': '^(?!\d{5}).*$', + 'state': '^(?![A-Z]{2}).*$', + 'national': '(?!usa).*$' +} + +class ValidationError(Exception): + def __init__(self, expression, message): + self.expression = expression + self.message = message + +def make_date_filter(start_date, end_date): + start_code = int(start_date.strftime("%Y%m%d")) + end_code = int(end_date.strftime("%Y%m%d")) + def f(filename, match): + if not match: return False + code = int(match.groupdict()['date']) + return code > start_code and code < end_code + return f + +def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): + + # Perform some automated format and sanity checks of =df.to.test= + if(type(max_check_lookbehind) != int | len(str(max_check_look_behind) != 1)): + raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be length 1, integer type") + + if( not isinstance(generation_date, datetime.date) or generation_date > date.today()): + raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a length 1 Date that is not in the future.") + # example: 
20200624_county_smoothed_nohh_cmnty_cli + + pattern_found = filename_regex.match(nameformat) + if (not nameformat or not pattern_found): + raise ValidationError(nameformat, 'nameformat ({nameformat}) not recognized') + +def check_bad_geo_id(df_to_test, geo_type): + if geo_type not in negated_regex_dict: + raise ValidationError(geo_type,"Unrecognized geo type") + + def find_all_unexpected_geo_ids(df_to_test, negated_regex): + unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] + if(len(unexpected_geos) > 0): + raise ValidationError(unexpected_geos,"Non-conforming geo_ids exist!") + + find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) + +def check_missing_dates(daily_filenames, sdate, edate): + number_of_dates = edate - sdate + timedelta(days=1) + date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} + unique_dates = set() + unique_dates_obj = set() + + for daily_filename in daily_filenames: + unique_dates.add(daily_filename[0:8]) + for unique_date in unique_dates: + newdate_obj = datetime.strptime(unique_date, '%Y%m%d') + unique_dates_obj.add(newdate_obj) + + check_dateholes = date_seq.difference(unique_dates_obj) + + if check_dateholes: + print("Missing dates are observed; if these dates are already in the API they would not be updated") + print(check_dateholes) + +def check_bad_val(df_to_test): + # if (not df_to_test[(df_to_test['val'] > 100)].empty): + # print("val column can't have any cell greater than 100") + # sys.exit() + if (df_to_test.isnull().values.any()): + raise ValidationError(None,"val column can't have any cell set to null") + if (not df_to_test[(df_to_test['val'] < 0)].empty): + raise ValidationError(None,"val column can't have any cell smaller than 0") + +def check_bad_se(df): + if (df['se'].isnull().values.any()): + raise ValidationError("se must not be NA") + + df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', 
inplace=True) + + df['se']= df['se'].round(3) + df['se_upper_limit'] = df['se_upper_limit'].round(3) + + result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + + if not result.empty: + raise ValidationError("se must be in (0,min(50,val*(1+eps))]") + +def check_bad_sample_size(df): + if(df['sample_size'].isnull.values.any() | df['effective_sample_size'].isnull.values.any()): + raise ValidationError("sample size can't be NA") + + qresult = df.query('(sample_size < 100) | (effective_sample_size < 100)') + + if not qresult.empty: + raise ValidationError("sample size must be >= 100") + +def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): + if (max_weighted_date < generation_date - timedelta(days=4) + or max_date < generation_date - timedelta(days=1)): + raise ValidationError("latest date of generated file seems too long ago") + +def reldiff_by_min(x, y): + return (x - y) / min(x,y) + +def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo): + recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] + recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) + + if(abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): + raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") + +def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + #print("recent_df dtypes", recent_df.dtypes) + recent_df = recent_df.drop(columns=['geo_id']) + mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() + recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() + recent_api_df = recent_api_df.drop(columns=['geo_value']) + + mean_recent_api_df = recent_api_df.mean() + + mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + 
mean_recent_api_df.mean()) + mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) + #print("mean_stddiff", mean_stddiff) + #print("mean_stdabsdiff", mean_stdabsdiff) + #print("type(mean_stdabsdiff)",type(mean_stdabsdiff)) + + classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] + raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) + + smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) + + # Code reference from R code + # changesum.by.variable.with.flags = changesum.by.variable %>>% + # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | + # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], + # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>% + # Todo - Check whats the purpose of variable=="val" in the above statement + + switcher = { + 'raw': raw_thresholds, + 'smoothed': smoothed_thresholds, + } + # Get the function from switcher dictionary + thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") + + #print(np.absolute(mean_stddiff) > thres.loc['mean.stddiff']) + mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() + mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() + + + if mean_stddiff_high or mean_stdabsdiff_high: + raise ValidationError('Average differences in variables by geoid between recent & semirecent data seem' \ + + 'large --- either large increase tending toward one direction or large mean absolute' \ + + 'difference, relative to average values of corresponding variables. 
For the former' \ + + 'check, tolerances for `val` are more restrictive than those for other columns.') + +def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): + + export_files = read_filenames(export_dir) + date_filter = make_date_filter(start_date, end_date) + validate_files = [(f, m) for (f, m) in export_files if date_filter(f,m)] + + all_frames = [] + + # First, check file formats + check_missing_dates(validate_files, start_date, end_date) + for filename,match in validate_files: + df = load_csv(filename) + check_bad_geo_id(df, match.groupdict()['geo_type']) + check_bad_val(df) + check_bad_se(df) + check_bad_sample_size(df) + df['geo_type'] = match.groupdict()['geo_type'] + df['date'] = match.groupdict()['date'] + df['signal'] = match.groupdict()['signal'] + all_frames.append(df) + + # Multi-indexed dataframe for a given (signal, geo_type) + + ## recent_lookbehind: start from the check date and working backward in time, + ## how many days do we include in the window of date to check for anomalies? + ## Choosing 1 day checks just the check data itself. + recent_lookbehind = timedelta(days=1) + + ## semirecent_lookbehind: starting from the check date and working backward + ## in time, how many days -- before subtracting out the "recent" days --- + ## do we use to form the reference statistics? 
+ semirecent_lookbehind = timedelta(days=7) + smooth_option_regex = re.compile(r'([^_]+)') + + kroc = 0 + for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + + m = smooth_option_regex.match(sig) + smooth_option = m.group(1) + + #recent_df.set_index("time_value", inplace = True) + print("Printing recent_df scenes:", recent_df.shape) + print(recent_df) + for checking_date in date_list: + #print(recent_df.loc[checking_date,:]) + # -recent- dataframe run backwards from the checking_date + recent_end_date = checking_date - timedelta(days=1) + recent_begin_date = checking_date - max_check_lookbehind + recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo) + + recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) + recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + + column_names = ["geo_value", "val", "se", "sample_size", "time_value"] + + recent_api_df = recent_api_df.reindex(columns=column_names) + if (recent_df["se"].isnull().mean() > 0.5): + print('Recent se values are >50% NA') + + if sanity_check_rows_per_day: + check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) + + if sanity_check_value_diffs: + check_avg_val_diffs(recent_df, recent_api_df, smooth_option) + kroc += 1 + if kroc == 2: + break + sys.exit() + + From 5572ef8ca90df1c673565b15240326751230e70f Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Tue, 8 Sep 2020 15:09:32 -0400 Subject: [PATCH 028/151] Remove old-codebase method; add pandas loader --- validator/delphi_validator/datafetcher.py | 94 +++++++++-------------- validator/delphi_validator/run.py | 10 +-- 2 files changed, 40 insertions(+), 64 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index facda3f5e..a53d83372 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py 
@@ -4,72 +4,40 @@ import covidcast import pandas as pd from datetime import date, datetime, timedelta -from .errors import * +from .errors import APIDataFetchError import re from typing import List import json +filename_regex = re.compile(r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') -def get_filenames_with_geo_signal(path, date_slist: List[str]): - - if pipeline_version == 'new': - meta = covidcast.metadata() - fb_meta = meta[meta['data_source']==DATA_SOURCE] - unique_signals = fb_meta['signal'].unique().tolist() - unique_geotypes = fb_meta['geo_type'].unique().tolist() - - - ##### Currently metadata returns --*community*-- signals that don't get generated - ##### in the new fb-pipeline. Seiving them out for now. - # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli - for sig in unique_signals: - if "community" in sig: - unique_signals.remove(sig) - - - geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) - print(geo_sig_cmbo) - print("Number of mixed types:", len(geo_sig_cmbo)) - - for cmb in geo_sig_cmbo: - print(cmb) - - - filenames = read_relevant_date_filenames(data_folder, date_slist[0]) - - else: - sdate = date_slist[0] - filenames = [f for f in listdir(path) if isfile(join(path, f))] - - sdate_filenames = [fname for fname in filenames if fname.find(sdate) != -1] - - # example: 20200624_county_smoothed_nohh_cmnty_cli - filename_regex = re.compile(r'^(\d{8})_([a-z]+)_(raw\S*|smoothed\S*)[_?](w?)([ci]li).csv$') - geo_sig_cmbo = list() - for f in sdate_filenames: - - m = filename_regex.match(f) - if (not m.group(0)): - print('=nameformat= not recognized as a daily format') - - geo_type = m.group(2) - - - if m.group(4): # weighted data 'w' - signal = "".join([m.group(4), m.group(5)]) - signal = "_".join([m.group(3), signal]) - # max_weighted_date = survey_date - else: - signal = "_".join([m.group(3), m.group(5)]) - # max_date = survey_date - - geo_sig_cmbo.append((geo_type, signal)) +def get_filenames_with_geo_signal(path, data_source, date_slist: 
List[str]): + meta = covidcast.metadata() + source_meta = meta[meta['data_source']==data_source] + unique_signals = source_meta['signal'].unique().tolist() + unique_geotypes = source_meta['geo_type'].unique().tolist() + + ##### Currently metadata returns --*community*-- signals that don't get generated + ##### in the new fb-pipeline. Seiving them out for now. + # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli + for sig in unique_signals: + if "community" in sig: + unique_signals.remove(sig) + + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) + print(geo_sig_cmbo) + print("Number of mixed types:", len(geo_sig_cmbo)) + + for cmb in geo_sig_cmbo: + print(cmb) + + filenames = read_relevant_date_filenames(data_folder, date_slist[0]) return filenames, geo_sig_cmbo def read_filenames(path): - daily_filenames = [f for f in listdir(path) if isfile(join(path, f))] + daily_filenames = [ (f, filename_regex.match(f)) for f in listdir(path) if isfile(join(path, f))] return daily_filenames def read_relevant_date_filenames(data_path, date_slist): @@ -80,7 +48,7 @@ def read_relevant_date_filenames(data_path, date_slist): for dt in date_slist: if fl.find(dt) != -1: filenames.append(fl) - return filenames + return filenames def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): for geo_sig in geo_sig_cmbo: @@ -105,6 +73,16 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): df_list.append(df) yield pd.concat(df_list), geo_sig[0], geo_sig[1] +def load_csv(path): + return pd.read_csv( + path, + dtype={ + 'geo_id': str, + 'val': float, + 'se': float, + 'sample_size': float, + }) + def fetch_daily_data(data_source, survey_date, geo_type, signal): data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) if not isinstance(data_to_validate, pd.DataFrame): @@ -114,4 +92,4 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal): ", geography-type:" + geo_type raise 
APIDataFetchError(custom_msg) return data_to_validate - \ No newline at end of file + diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 6536b5059..55ab1bb9e 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -4,6 +4,7 @@ This module should contain a function called `run_module`, that is executed when the module is run with `python -m delphi_validator`. """ +from datetime import datetime import numpy as np import pandas as pd from delphi_utils import read_params @@ -12,12 +13,9 @@ def run_module(): - params = read_params()["validation"] + parent_params = read_params() + params = parent_params['validation'] dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') max_check_lookbehind = int(params["ref_window_size"]) - - # Collecting all filenames - daily_filnames = read_filenames(params["data_folder"]) - - validate(daily_filnames, dtobj_sdate, dtobj_edate) + validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate) From 8eb1f6a140b99a880041d5f95b95db7212cae70a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 9 Sep 2020 16:44:05 -0400 Subject: [PATCH 029/151] [wip] Added linting template info to README. Replaced instances of effective_sample_size, where occurring alone, with sample_size; removed where occuring with sample_size. Fixed ValidationError calls in validate.py. Fixed various bugs in check_bad_ geo_id, val, se, and sample_size. --- validator/README.md | 12 ++++++++++++ validator/delphi_validator/validate.py | 25 +++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/validator/README.md b/validator/README.md index 3591bb0e1..bcac88fc1 100644 --- a/validator/README.md +++ b/validator/README.md @@ -41,3 +41,15 @@ and (optionally) remove the environment itself. 
deactivate rm -r env ``` + +## Testing the code + +To do a static test of the code style, it is recommended to run **pylint** on +the module. To do this, run the following from the main module directory: + +``` +env/bin/pylint delphi_validator +``` + +The most aggressive checks are turned off; only relatively important issues +should be raised and they should be manually checked (or better, fixed). \ No newline at end of file diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index a6be89cc3..68c13e641 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -1,4 +1,5 @@ import sys +import os import re import pandas as pd import numpy as np @@ -62,7 +63,7 @@ def check_missing_dates(daily_filenames, sdate, edate): unique_dates_obj = set() for daily_filename in daily_filenames: - unique_dates.add(daily_filename[0:8]) + unique_dates.add(daily_filename[0][0:8]) for unique_date in unique_dates: newdate_obj = datetime.strptime(unique_date, '%Y%m%d') unique_dates_obj.add(newdate_obj) @@ -84,9 +85,9 @@ def check_bad_val(df_to_test): def check_bad_se(df): if (df['se'].isnull().values.any()): - raise ValidationError("se must not be NA") + raise ValidationError(None, "se must not be NA") - df.eval('se_upper_limit = (val * effective_sample_size + 50)/(effective_sample_size + 1)', inplace=True) + df.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) df['se']= df['se'].round(3) df['se_upper_limit'] = df['se_upper_limit'].round(3) @@ -94,21 +95,21 @@ def check_bad_se(df): result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - raise ValidationError("se must be in (0,min(50,val*(1+eps))]") + raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") def check_bad_sample_size(df): - if(df['sample_size'].isnull.values.any() | df['effective_sample_size'].isnull.values.any()): - raise ValidationError("sample size can't be 
NA") + if(df['sample_size'].isnull().values.any()): + raise ValidationError(None, "sample size can't be NA") - qresult = df.query('(sample_size < 100) | (effective_sample_size < 100)') + qresult = df.query('(sample_size < 100)') if not qresult.empty: - raise ValidationError("sample size must be >= 100") + raise ValidationError(None, "sample size must be >= 100") def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): if (max_weighted_date < generation_date - timedelta(days=4) or max_date < generation_date - timedelta(days=1)): - raise ValidationError("latest date of generated file seems too long ago") + raise ValidationError(None, "latest date of generated file seems too long ago") def reldiff_by_min(x, y): return (x - y) / min(x,y) @@ -160,7 +161,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): if mean_stddiff_high or mean_stdabsdiff_high: - raise ValidationError('Average differences in variables by geoid between recent & semirecent data seem' \ + raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geoid between recent & semirecent data seem' \ + 'large --- either large increase tending toward one direction or large mean absolute' \ + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') @@ -175,8 +176,8 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( # First, check file formats check_missing_dates(validate_files, start_date, end_date) - for filename,match in validate_files: - df = load_csv(filename) + for filename, match in validate_files: + df = load_csv(os.path.join(export_dir, filename)) check_bad_geo_id(df, match.groupdict()['geo_type']) check_bad_val(df) check_bad_se(df) From 7719f03097bc81540d48d04fa7f3a3971d78dede Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 10 Sep 2020 11:41:03 -0400 Subject: [PATCH 030/151] [wip] Defined function to get geo_sig_combo outside context of get_filenames_with_geo_signal. Removed duplicate code in datafetcher.py. Defined geo_sig_combo and date lists for validate(). Added TODOs identifying improvements. --- validator/delphi_validator/datafetcher.py | 22 ++++++++++++++-------- validator/delphi_validator/validate.py | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index a53d83372..b6dba067d 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -8,6 +8,7 @@ import re from typing import List import json +from itertools import product filename_regex = re.compile(r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') @@ -36,6 +37,16 @@ def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): return filenames, geo_sig_cmbo +def get_geo_sig_cmbo(df): + unique_signals = df['signal'].unique().tolist() + unique_geotypes = df['geo_type'].unique().tolist() + + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) + print("Number of mixed types:", len(geo_sig_cmbo)) + + return geo_sig_cmbo + + def read_filenames(path): daily_filenames = [ (f, 
filename_regex.match(f)) for f in listdir(path) if isfile(join(path, f))] return daily_filenames @@ -57,15 +68,10 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): files = list(filter(lambda x: geo_sig[0] in x and geo_sig[1] in x, filenames)) if(len(files) == 0): print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") + yield pd.DataFrame(), geo_sig[0], geo_sig[1] + continue for f in files: - df = pd.read_csv( - data_folder / f, - dtype={'geo_id': str, - 'val': float, - 'se': float, - 'sample_size': float, - 'effective_sample_size': float - }) + df = load_csv(join(data_folder, f)) for dt in date_slist: if f.find(dt) != -1: gen_dt = datetime.strptime(dt, '%Y%m%d') diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 68c13e641..0aed4e462 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -1,5 +1,5 @@ import sys -import os +from os.path import join import re import pandas as pd import numpy as np @@ -106,6 +106,7 @@ def check_bad_sample_size(df): if not qresult.empty: raise ValidationError(None, "sample size must be >= 100") +# Not currently checked. 
def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): if (max_weighted_date < generation_date - timedelta(days=4) or max_date < generation_date - timedelta(days=1)): @@ -177,7 +178,7 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( # First, check file formats check_missing_dates(validate_files, start_date, end_date) for filename, match in validate_files: - df = load_csv(os.path.join(export_dir, filename)) + df = load_csv(join(export_dir, filename)) check_bad_geo_id(df, match.groupdict()['geo_type']) check_bad_val(df) check_bad_se(df) @@ -187,7 +188,12 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( df['signal'] = match.groupdict()['signal'] all_frames.append(df) - # Multi-indexed dataframe for a given (signal, geo_type) + # TODO: Multi-indexed dataframe for a given (signal, geo_type) + all_frames = pd.concat(all_frames) + + geo_sig_cmbo = get_geo_sig_cmbo(all_frames) + date_slist = df['date'].unique().tolist() + date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) ## recent_lookbehind: start from the check date and working backward in time, ## how many days do we include in the window of date to check for anomalies? @@ -201,7 +207,9 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( smooth_option_regex = re.compile(r'([^_]+)') kroc = 0 - for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): + + # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). 
+ for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, validate_files, date_slist): m = smooth_option_regex.match(sig) smooth_option = m.group(1) From 147401ad4af49f02935ec62cc169149faec2902c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 10 Sep 2020 14:30:04 -0400 Subject: [PATCH 031/151] [wip] Fetch data_source from params.json. Added as argument to validate() --- validator/delphi_validator/run.py | 3 ++- validator/delphi_validator/validate.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 55ab1bb9e..79ae10b7b 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -18,4 +18,5 @@ def run_module(): dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') max_check_lookbehind = int(params["ref_window_size"]) - validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate) + data_source = params['data_source'] + validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 0aed4e462..b2518dd57 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -167,7 +167,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') -def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): +def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): export_files = read_filenames(export_dir) date_filter = make_date_filter(start_date, end_date) @@ -179,10 +179,10 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( check_missing_dates(validate_files, start_date, end_date) for filename, match in validate_files: df = load_csv(join(export_dir, filename)) - check_bad_geo_id(df, match.groupdict()['geo_type']) - check_bad_val(df) - check_bad_se(df) - check_bad_sample_size(df) + # check_bad_geo_id(df, match.groupdict()['geo_type']) + # check_bad_val(df) + # check_bad_se(df) + # check_bad_sample_size(df) df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] @@ -190,7 +190,8 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) - + + # TODO: Should be checking covidcast.meta() for all geo-sig combos to see if any CSVs are missing. 
geo_sig_cmbo = get_geo_sig_cmbo(all_frames) date_slist = df['date'].unique().tolist() date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) @@ -220,9 +221,9 @@ def validate(export_dir, start_date, end_date, max_check_lookbehind = timedelta( for checking_date in date_list: #print(recent_df.loc[checking_date,:]) # -recent- dataframe run backwards from the checking_date - recent_end_date = checking_date - timedelta(days=1) + recent_end_date = checking_date - recent_lookbehind recent_begin_date = checking_date - max_check_lookbehind - recent_api_df = covidcast.signal(DATA_SOURCE, sig, recent_begin_date, recent_end_date, geo) + recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) From 8157a6b8d2ee54231e7743fbb63faff9e71a5fb8 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 10 Sep 2020 15:06:22 -0400 Subject: [PATCH 032/151] [wip] get_geo_sig_cmbo() now encapsulates covidcast metadata call --- validator/delphi_validator/datafetcher.py | 32 ++++++++++------------- validator/delphi_validator/validate.py | 9 ++++--- validator/params.json.template | 3 +-- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index b6dba067d..fdc92aaf8 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -14,21 +14,7 @@ def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): - meta = covidcast.metadata() - source_meta = meta[meta['data_source']==data_source] - unique_signals = source_meta['signal'].unique().tolist() - unique_geotypes = source_meta['geo_type'].unique().tolist() - - ##### Currently metadata returns --*community*-- signals that don't get generated - 
##### in the new fb-pipeline. Seiving them out for now. - # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli - for sig in unique_signals: - if "community" in sig: - unique_signals.remove(sig) - - geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) - print(geo_sig_cmbo) - print("Number of mixed types:", len(geo_sig_cmbo)) + geo_sig_cmbo = get_geo_sig_cmbo(data_source) for cmb in geo_sig_cmbo: print(cmb) @@ -37,10 +23,20 @@ def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): return filenames, geo_sig_cmbo -def get_geo_sig_cmbo(df): - unique_signals = df['signal'].unique().tolist() - unique_geotypes = df['geo_type'].unique().tolist() +def get_geo_sig_cmbo(data_source): + meta = covidcast.metadata() + source_meta = meta[meta['data_source']==data_source] + unique_signals = source_meta['signal'].unique().tolist() + unique_geotypes = source_meta['geo_type'].unique().tolist() + if data_source == 'fb-survey': + ##### Currently metadata returns --*community*-- signals that don't get generated + ##### in the new fb-pipeline. Seiving them out for now. + # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli + for sig in unique_signals: + if "community" in sig: + unique_signals.remove(sig) + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) print("Number of mixed types:", len(geo_sig_cmbo)) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index b2518dd57..db3d6c216 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -191,8 +191,7 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) - # TODO: Should be checking covidcast.meta() for all geo-sig combos to see if any CSVs are missing. 
- geo_sig_cmbo = get_geo_sig_cmbo(all_frames) + geo_sig_cmbo = get_geo_sig_cmbo(data_source) date_slist = df['date'].unique().tolist() date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) @@ -205,13 +204,15 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind ## in time, how many days -- before subtracting out the "recent" days --- ## do we use to form the reference statistics? semirecent_lookbehind = timedelta(days=7) + + smooth_option_regex = re.compile(r'([^_]+)') kroc = 0 # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, validate_files, date_slist): - + m = smooth_option_regex.match(sig) smooth_option = m.group(1) @@ -239,6 +240,8 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) + + # TODO: Add semirecent check? 
kroc += 1 if kroc == 2: break diff --git a/validator/params.json.template b/validator/params.json.template index e70b5ce48..f933086d2 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,10 +1,9 @@ { "validation": { - "data_source": "fb_survey", + "data_source": "fb-survey", "start_date": "2020-06-13", "end_date": "2020-06-19", "ref_window_size": 7, - "data_folder": "C:/Covidcast/covidcast-indicators/validator/data", "minimum_sample_size": 100 } } \ No newline at end of file From 00a249abecbf290f32c0cee6fabcce4530a03de8 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Sep 2020 11:05:12 -0400 Subject: [PATCH 033/151] Set alternative to smooth_option for non-FB data --- validator/delphi_validator/datafetcher.py | 2 +- validator/delphi_validator/validate.py | 24 ++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index fdc92aaf8..3566b70d1 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -62,7 +62,7 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): df_list = list() files = list(filter(lambda x: geo_sig[0] in x and geo_sig[1] in x, filenames)) - if(len(files) == 0): + if (len(files) == 0): print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") yield pd.DataFrame(), geo_sig[0], geo_sig[1] continue diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index db3d6c216..1202b13cd 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -123,7 +123,7 @@ def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly 
(latest vs recent window of data)") def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): - #print("recent_df dtypes", recent_df.dtypes) + # TODO: something is wrong with this check definition. recent_df = recent_df.drop(columns=['geo_id']) mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() @@ -133,15 +133,12 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) - #print("mean_stddiff", mean_stddiff) - #print("mean_stdabsdiff", mean_stdabsdiff) - #print("type(mean_stdabsdiff)",type(mean_stdabsdiff)) classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) - + # Code reference from R code # changesum.by.variable.with.flags = changesum.by.variable %>>% # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | @@ -162,7 +159,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): if mean_stddiff_high or mean_stdabsdiff_high: - raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geoid between recent & semirecent data seem' \ + raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & semirecent data seem' \ + 'large --- either large increase tending toward one direction or large mean absolute' \ + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') @@ -179,10 +176,10 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind check_missing_dates(validate_files, start_date, end_date) for filename, match in validate_files: df = load_csv(join(export_dir, filename)) - # check_bad_geo_id(df, match.groupdict()['geo_type']) - # check_bad_val(df) - # check_bad_se(df) - # check_bad_sample_size(df) + check_bad_geo_id(df, match.groupdict()['geo_type']) + check_bad_val(df) + check_bad_se(df) + check_bad_sample_size(df) df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] @@ -195,6 +192,8 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind date_slist = df['date'].unique().tolist() date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) + filenames = [name_match_pair[0] for name_match_pair in validate_files] + ## recent_lookbehind: start from the check date and working backward in time, ## how many days do we include in the window of date to check for anomalies? ## Choosing 1 day checks just the check data itself. @@ -211,10 +210,13 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind kroc = 0 # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). 
- for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, validate_files, date_slist): + for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): m = smooth_option_regex.match(sig) smooth_option = m.group(1) + + if smooth_option not in ('raw', 'smoothed'): + smooth_option = 'smoothed' if '7dav' in sig else 'raw' #recent_df.set_index("time_value", inplace = True) print("Printing recent_df scenes:", recent_df.shape) From d9cfa30662d74227bb8ab212b40ed4c52dd89035 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Sep 2020 13:07:36 -0400 Subject: [PATCH 034/151] [wip] Runs but some check definitions are wrong --- validator/delphi_validator/validate.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 1202b13cd..af3ad9fed 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -9,6 +9,8 @@ from .datafetcher import * import math +import pdb + negated_regex_dict = { 'county': '^(?!\d{5}).*$', 'hrr': '^(?!\d{1,3}).*$', @@ -30,7 +32,8 @@ def f(filename, match): code = int(match.groupdict()['date']) return code > start_code and code < end_code return f - + +# TODO: not used. 
def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): # Perform some automated format and sanity checks of =df.to.test= @@ -39,8 +42,8 @@ def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_c if( not isinstance(generation_date, datetime.date) or generation_date > date.today()): raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a length 1 Date that is not in the future.") + # example: 20200624_county_smoothed_nohh_cmnty_cli - pattern_found = filename_regex.match(nameformat) if (not nameformat or not pattern_found): raise ValidationError(nameformat, 'nameformat ({nameformat}) not recognized') @@ -123,6 +126,8 @@ def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + # pdb.set_trace() + # TODO: something is wrong with this check definition. 
recent_df = recent_df.drop(columns=['geo_id']) mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() @@ -153,8 +158,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): # Get the function from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - #print(np.absolute(mean_stddiff) > thres.loc['mean.stddiff']) - mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() + mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() # or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() From e491f7cefcab6e425730dbe29b63d519a85b1247 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 15 Sep 2020 18:21:48 -0400 Subject: [PATCH 035/151] Added docstring and comments to validate_daily(). Checked correctness against reference code. --- validator/delphi_validator/validate.py | 32 +++++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index af3ad9fed..388df12db 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -20,6 +20,7 @@ } class ValidationError(Exception): + """ Error raised when validation check fails. """ def __init__(self, expression, message): self.expression = expression self.message = message @@ -33,21 +34,40 @@ def f(filename, match): return code > start_code and code < end_code return f -# TODO: not used. -def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): +# TODO: not used. 
Several arguments not used and should probably be moved to validate(). +def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = False, check_vs_working = True): + """ + Perform some automated format & sanity checks of inputs. - # Perform some automated format and sanity checks of =df.to.test= - if(type(max_check_lookbehind) != int | len(str(max_check_look_behind) != 1)): + Arguments: + - df_to_test: pandas dataframe + - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli" + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - sanity_check_rows_per_day + - sanity_check_value_diffs: detects false positives most of the time, defaults to False + - check_vs_working + + Returns: + - None + """ + + if (type(max_check_lookbehind) != int or len(str(max_check_look_behind) != 1)): raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be length 1, integer type") if( not isinstance(generation_date, datetime.date) or generation_date > date.today()): - raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a length 1 Date that is not in the future.") + raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a datetime.date type and not in the future.") - # example: 20200624_county_smoothed_nohh_cmnty_cli pattern_found = filename_regex.match(nameformat) if (not nameformat or not pattern_found): raise ValidationError(nameformat, 'nameformat ({nameformat}) not recognized') + if not isinstance(df_to_test, pd.DataFrame): + raise ValidationError(nameformat, 'df_to_test must be a pandas dataframe.') + + # TODO: check column names and types in df_to_test. 
Currently skipped since load_csv() specifies field names and types on read. Extra columns will simply be ignored during later processing. + + def check_bad_geo_id(df_to_test, geo_type): if geo_type not in negated_regex_dict: raise ValidationError(geo_type,"Unrecognized geo type") From ae160377e1e3504334c3cebc03332d5548065f41 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 15 Sep 2020 18:31:23 -0400 Subject: [PATCH 036/151] Initial docstring in validate(). validate_daily() now called in validate() --- validator/delphi_validator/validate.py | 29 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 388df12db..efafa049e 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -34,19 +34,16 @@ def f(filename, match): return code > start_code and code < end_code return f -# TODO: not used. Several arguments not used and should probably be moved to validate(). -def validate_daily(df_to_test, nameformat, generation_date = date.today(), max_check_lookbehind = 7, sanity_check_rows_per_day = True, sanity_check_value_diffs = False, check_vs_working = True): +# TODO: generation_date not used and should probably be moved to validate(). +def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date = date.today()): """ Perform some automated format & sanity checks of inputs. 
Arguments: - df_to_test: pandas dataframe - - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli" - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - sanity_check_rows_per_day - - sanity_check_value_diffs: detects false positives most of the time, defaults to False - - check_vs_working + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test Returns: - None @@ -188,7 +185,21 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + 'difference, relative to average values of corresponding variables. For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') -def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = True): +def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = False, check_vs_working = True): + """ + Perform data checks. 
+ + Arguments: + + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - sanity_check_rows_per_day + - sanity_check_value_diffs: detects false positives most of the time, defaults to False + - check_vs_working + + Returns: + - None + """ export_files = read_filenames(export_dir) date_filter = make_date_filter(start_date, end_date) @@ -200,6 +211,8 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind check_missing_dates(validate_files, start_date, end_date) for filename, match in validate_files: df = load_csv(join(export_dir, filename)) + + validate_daily(df, filename, max_check_lookbehind) check_bad_geo_id(df, match.groupdict()['geo_type']) check_bad_val(df) check_bad_se(df) From 2be6a787edd7770a54a778fa5bd3cf0b9af59502 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 16 Sep 2020 11:15:05 -0400 Subject: [PATCH 037/151] Debugged validate_daily(). Implemented and debugged check_min_allowed_max_date() and check_max_allowed_max_date() to match reference code.Both checks now used in validate(). --- validator/delphi_validator/validate.py | 36 +++++++++++++++++--------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index efafa049e..642256cf7 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -34,8 +34,7 @@ def f(filename, match): return code > start_code and code < end_code return f -# TODO: generation_date not used and should probably be moved to validate(). 
-def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date = date.today()): +def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date): """ Perform some automated format & sanity checks of inputs. @@ -49,10 +48,10 @@ def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date - None """ - if (type(max_check_lookbehind) != int or len(str(max_check_look_behind) != 1)): - raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be length 1, integer type") + if (not isinstance(max_check_lookbehind, timedelta)): + raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta") - if( not isinstance(generation_date, datetime.date) or generation_date > date.today()): + if( not isinstance(generation_date, date) or generation_date > date.today()): raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a datetime.date type and not in the future.") pattern_found = filename_regex.match(nameformat) @@ -126,12 +125,21 @@ def check_bad_sample_size(df): if not qresult.empty: raise ValidationError(None, "sample size must be >= 100") -# Not currently checked. 
-def check_min_allowed_max_date(generation_date, max_date, max_weighted_date): - if (max_weighted_date < generation_date - timedelta(days=4) - or max_date < generation_date - timedelta(days=1)): +def check_min_allowed_max_date(max_date, generation_date, weighted_option='unweighted'): + switcher = { + 'unweighted': timedelta(days=1), + 'weighted': timedelta(days=4) + } + # Get the function from switcher dictionary + thres = switcher.get(weighted_option, lambda: "Invalid weighting option") + + if (max_date < generation_date - thres): raise ValidationError(None, "latest date of generated file seems too long ago") +def check_max_allowed_max_date(max_date, generation_date): + if (max_date < generation_date - timedelta(days=1)): + raise ValidationError(None, "latest date of generated file seems too recent") + def reldiff_by_min(x, y): return (x - y) / min(x,y) @@ -185,7 +193,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + 'difference, relative to average values of corresponding variables. For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') -def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), sanity_check_rows_per_day = True, sanity_check_value_diffs = False, check_vs_working = True): +def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), generation_date = date.today(), sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): """ Perform data checks. 
@@ -194,7 +202,7 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - sanity_check_rows_per_day - - sanity_check_value_diffs: detects false positives most of the time, defaults to False + - sanity_check_value_diffs: - check_vs_working Returns: @@ -207,12 +215,16 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind all_frames = [] + # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? + check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + check_max_allowed_max_date(end_date, generation_date) + # First, check file formats check_missing_dates(validate_files, start_date, end_date) for filename, match in validate_files: df = load_csv(join(export_dir, filename)) - validate_daily(df, filename, max_check_lookbehind) + validate_daily(df, filename, max_check_lookbehind, generation_date) check_bad_geo_id(df, match.groupdict()['geo_type']) check_bad_val(df) check_bad_se(df) From 0ff4fd7f34bee9526300d00ac8d619f2b570ac0e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 17 Sep 2020 10:03:54 -0400 Subject: [PATCH 038/151] Changed geo_id regex to use lowercase letters for state abbreviation --- validator/delphi_validator/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 642256cf7..e572f2c51 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -15,7 +15,7 @@ 'county': '^(?!\d{5}).*$', 'hrr': '^(?!\d{1,3}).*$', 'msa': '^(?!\d{5}).*$', - 'state': '^(?![A-Z]{2}).*$', + 'state': '^(?![a-z]{2}).*$', 'national': 
'(?!usa).*$' } From ccf80157b5e6770839765128e81b2ee36e40e927 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 17 Sep 2020 11:00:53 -0400 Subject: [PATCH 039/151] Added check in check_bad_val for proprtion maximum allowed value. Added docstrings to this and geo id check --- validator/delphi_validator/datafetcher.py | 2 +- validator/delphi_validator/validate.py | 56 ++++++++++++++++------- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 3566b70d1..608efac05 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -32,7 +32,7 @@ def get_geo_sig_cmbo(data_source): if data_source == 'fb-survey': ##### Currently metadata returns --*community*-- signals that don't get generated ##### in the new fb-pipeline. Seiving them out for now. - # Todo - Include weighted whh_cmnty_cli and wnohh_cmnty_cli + # TODO: Include weighted whh_cmnty_cli and wnohh_cmnty_cli for sig in unique_signals: if "community" in sig: unique_signals.remove(sig) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index e572f2c51..1fe962f1c 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -39,7 +39,7 @@ def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date Perform some automated format & sanity checks of inputs. 
Arguments: - - df_to_test: pandas dataframe + - df_to_test: pandas dataframe of CSV source data - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test @@ -65,6 +65,16 @@ def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date def check_bad_geo_id(df_to_test, geo_type): + """ + Check validity of geo type and values, according to regex pattern. + + Arguments: + - df_to_test: pandas dataframe of CSV source data + - geo_type: string from CSV name specifying geo type (state, county, etc) of data + + Returns: + - None + """ if geo_type not in negated_regex_dict: raise ValidationError(geo_type,"Unrecognized geo type") @@ -93,34 +103,48 @@ def check_missing_dates(daily_filenames, sdate, edate): print("Missing dates are observed; if these dates are already in the API they would not be updated") print(check_dateholes) -def check_bad_val(df_to_test): - # if (not df_to_test[(df_to_test['val'] > 100)].empty): - # print("val column can't have any cell greater than 100") - # sys.exit() +def check_bad_val(df_to_test, signal_type): + """ + Check value field for validity. 
+ + Arguments: + - df_to_test: pandas dataframe of CSV source data + - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data + + Returns: + - None + """ + proportion_option = True if 'prop' in signal_type or 'pct' in signal_type else False + + if proportion_option: + if (not df_to_test[(df_to_test['val'] > 100)].empty): + raise ValidationError(None,"val column can't have any cell greater than 100") + if (df_to_test.isnull().values.any()): raise ValidationError(None,"val column can't have any cell set to null") + if (not df_to_test[(df_to_test['val'] < 0)].empty): raise ValidationError(None,"val column can't have any cell smaller than 0") -def check_bad_se(df): - if (df['se'].isnull().values.any()): +def check_bad_se(df_to_test): + if (df_to_test['se'].isnull().values.any()): raise ValidationError(None, "se must not be NA") - df.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) + df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) - df['se']= df['se'].round(3) - df['se_upper_limit'] = df['se_upper_limit'].round(3) + df_to_test['se']= df_to_test['se'].round(3) + df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) - result = df.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") -def check_bad_sample_size(df): - if(df['sample_size'].isnull().values.any()): +def check_bad_sample_size(df_to_test): + if(df_to_test['sample_size'].isnull().values.any()): raise ValidationError(None, "sample size can't be NA") - qresult = df.query('(sample_size < 100)') + qresult = df_to_test.query('(sample_size < 100)') if not qresult.empty: raise ValidationError(None, "sample size must be >= 100") @@ -226,7 +250,7 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind 
validate_daily(df, filename, max_check_lookbehind, generation_date) check_bad_geo_id(df, match.groupdict()['geo_type']) - check_bad_val(df) + check_bad_val(df, match.groupdict()['signal']) check_bad_se(df) check_bad_sample_size(df) df['geo_type'] = match.groupdict()['geo_type'] @@ -265,7 +289,7 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind smooth_option = m.group(1) if smooth_option not in ('raw', 'smoothed'): - smooth_option = 'smoothed' if '7dav' in sig else 'raw' + smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' #recent_df.set_index("time_value", inplace = True) print("Printing recent_df scenes:", recent_df.shape) From 0b9d39e7a51bc07249d1b075033e374c2351002f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 17 Sep 2020 11:37:35 -0400 Subject: [PATCH 040/151] Changed check_bad_se to allow for NA standard errors. Added docstrings --- validator/delphi_validator/validate.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 1fe962f1c..5fdeadc30 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -127,18 +127,24 @@ def check_bad_val(df_to_test, signal_type): raise ValidationError(None,"val column can't have any cell smaller than 0") def check_bad_se(df_to_test): - if (df_to_test['se'].isnull().values.any()): - raise ValidationError(None, "se must not be NA") + """ + Check standard errors for validity. 
+ Arguments: + - df_to_test: pandas dataframe of CSV source data + + Returns: + - None + """ df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) df_to_test['se']= df_to_test['se'].round(3) df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) - result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + result = df_to_test.query('~(~se.isnull() & (((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: - raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") + raise ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]") def check_bad_sample_size(df_to_test): if(df_to_test['sample_size'].isnull().values.any()): From 51302e3cbdd63dcb80d7f00de2ddd5c4e46f4788 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 11:07:29 -0400 Subject: [PATCH 041/151] Added general params argument to validate() to encapsulate miscellaneous optional user settings. Optional settings now fetched or default set at beginning of validate(). Fixed check_min/max_allowed_max_date() error. Added two different check paths to check_bad_se() based on user setting. 
Added check that se should be non-zero when val is 0 to address Quidel data issue (#255) --- validator/delphi_validator/run.py | 9 +++--- validator/delphi_validator/validate.py | 44 ++++++++++++++++++++------ 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 79ae10b7b..4b3ee32db 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -15,8 +15,9 @@ def run_module(): parent_params = read_params() params = parent_params['validation'] - dtobj_sdate = datetime.strptime(params['start_date'], '%Y-%m-%d') - dtobj_edate = datetime.strptime(params['end_date'], '%Y-%m-%d') - max_check_lookbehind = int(params["ref_window_size"]) + data_source = params['data_source'] - validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source) + dtobj_sdate = datetime.date(datetime.strptime(params['start_date'], '%Y-%m-%d')) + dtobj_edate = datetime.date(datetime.strptime(params['end_date'], '%Y-%m-%d')) + + validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source, params) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 5fdeadc30..4ab214e0c 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -126,12 +126,13 @@ def check_bad_val(df_to_test, signal_type): if (not df_to_test[(df_to_test['val'] < 0)].empty): raise ValidationError(None,"val column can't have any cell smaller than 0") -def check_bad_se(df_to_test): +def check_bad_se(df_to_test, missing_se_allowed): """ Check standard errors for validity. 
Arguments: - df_to_test: pandas dataframe of CSV source data + - missing_se_allowed: boolean specified in params.json Returns: - None @@ -141,19 +142,34 @@ def check_bad_se(df_to_test): df_to_test['se']= df_to_test['se'].round(3) df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) - result = df_to_test.query('~(~se.isnull() & (((se > 0) & (se < 50) & (se <= se_upper_limit)))') + if not missing_se_allowed: + if (df_to_test['se'].isnull().values.any()): + raise ValidationError(None, "se must not be NA") + + result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + + if not result.empty: + raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") + + elif missing_se_allowed: + result = df_to_test.query('~(~se.isnull() & (((se > 0) & (se < 50) & (se <= se_upper_limit)))') + + if not result.empty: + raise ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]") + + result = df_to_test.query('(val == 0) & (se == 0)') if not result.empty: - raise ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]") + raise ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se") -def check_bad_sample_size(df_to_test): +def check_bad_sample_size(df_to_test, minimum_sample_size): if(df_to_test['sample_size'].isnull().values.any()): raise ValidationError(None, "sample size can't be NA") - qresult = df_to_test.query('(sample_size < 100)') + qresult = df_to_test.query('(sample_size < @minimum_sample_size)') if not qresult.empty: - raise ValidationError(None, "sample size must be >= 100") + raise ValidationError(None, "sample size must be >= {minimum_sample_size}") def check_min_allowed_max_date(max_date, generation_date, weighted_option='unweighted'): switcher = { @@ -223,7 +239,7 @@ def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') -def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind = timedelta(days=7), generation_date = date.today(), sanity_check_rows_per_day = True, sanity_check_value_diffs = True, check_vs_working = True): +def validate(export_dir, start_date, end_date, data_source, params, generation_date = date.today()): """ Perform data checks. @@ -238,6 +254,16 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind Returns: - None """ + # Get user settings from params or if not provided, set default. + max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) + minimum_sample_size = params.get('minimum_sample_size', 100) + missing_se_allowed = params.get('missing_se_allowed', False) + missing_sample_size_allowed = params.get('missing_sample_size_allowed', False) + + sanity_check_rows_per_day = params.get('sanity_check_rows_per_day', True) + sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) + check_vs_working = params.get('check_vs_working', True) + export_files = read_filenames(export_dir) date_filter = make_date_filter(start_date, end_date) @@ -257,8 +283,8 @@ def validate(export_dir, start_date, end_date, data_source, max_check_lookbehind validate_daily(df, filename, max_check_lookbehind, generation_date) check_bad_geo_id(df, match.groupdict()['geo_type']) check_bad_val(df, match.groupdict()['signal']) - check_bad_se(df) - check_bad_sample_size(df) + check_bad_se(df, missing_se_allowed) + check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] From e820f977eaf12dc6669b2fca81b077664a47f9c4 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 12:18:54 -0400 Subject: [PATCH 042/151] 
Modified check_bad_sample_size() to allow user-defined minimum sample size and accept or reject missing sample sizes. Modified query logic in se check. --- validator/delphi_validator/validate.py | 29 ++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 4ab214e0c..6fa7eed7c 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -118,10 +118,10 @@ def check_bad_val(df_to_test, signal_type): if proportion_option: if (not df_to_test[(df_to_test['val'] > 100)].empty): - raise ValidationError(None,"val column can't have any cell greater than 100") + raise ValidationError(signal_type, "val column can't have any cell greater than 100") - if (df_to_test.isnull().values.any()): - raise ValidationError(None,"val column can't have any cell set to null") + if (df_to_test['val'].isnull().values.any()): + raise ValidationError(None,"val column can't have any cell that is NA") if (not df_to_test[(df_to_test['val'] < 0)].empty): raise ValidationError(None,"val column can't have any cell smaller than 0") @@ -152,7 +152,7 @@ def check_bad_se(df_to_test, missing_se_allowed): raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") elif missing_se_allowed: - result = df_to_test.query('~(~se.isnull() & (((se > 0) & (se < 50) & (se <= se_upper_limit)))') + result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') @@ -162,14 +162,21 @@ if not result.empty: raise ValidationError(None, "when signal value is 0, se must be non-zero. 
please use Jeffreys correction to generate an appropriate se") -def check_bad_sample_size(df_to_test, minimum_sample_size): - if(df_to_test['sample_size'].isnull().values.any()): - raise ValidationError(None, "sample size can't be NA") - - qresult = df_to_test.query('(sample_size < @minimum_sample_size)') +def check_bad_sample_size(df_to_test, minimum_sample_size, missing_sample_size_allowed): + if not missing_sample_size_allowed: + if (df_to_test['sample_size'].isnull().values.any()): + raise ValidationError(None, "sample_size must not be NA") + + result = df_to_test.query('(sample_size < @minimum_sample_size)') + + if not result.empty: + raise ValidationError(None, "sample size must be >= {minimum_sample_size}") + + elif missing_sample_size_allowed: + result = df_to_test.query('~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') - if not qresult.empty: - raise ValidationError(None, "sample size must be >= {minimum_sample_size}") + if not result.empty: + raise ValidationError(None, "sample size must be NA or >= {minimum_sample_size}") def check_min_allowed_max_date(max_date, generation_date, weighted_option='unweighted'): switcher = { From 7b0785f0bbb657175420cd51164c589ee37c0a1d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 13:18:24 -0400 Subject: [PATCH 043/151] Removed duplicate code in check_missing_dates(). Improved print statement. 
--- validator/delphi_validator/validate.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 6fa7eed7c..6526c76f2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -89,15 +89,12 @@ def check_missing_dates(daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} unique_dates = set() - unique_dates_obj = set() for daily_filename in daily_filenames: - unique_dates.add(daily_filename[0][0:8]) - for unique_date in unique_dates: - newdate_obj = datetime.strptime(unique_date, '%Y%m%d') - unique_dates_obj.add(newdate_obj) + unique_dates.add(datetime.strptime(daily_filename[0][0:8], '%Y%m%d')) - check_dateholes = date_seq.difference(unique_dates_obj) + check_dateholes = list(date_seq.difference(unique_dates)) + check_dateholes.sort() if check_dateholes: print("Missing dates are observed; if these dates are already in the API they would not be updated") @@ -278,9 +275,9 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d all_frames = [] - # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? - check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') - check_max_allowed_max_date(end_date, generation_date) + # # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? 
+ # check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + # check_max_allowed_max_date(end_date, generation_date) # First, check file formats check_missing_dates(validate_files, start_date, end_date) From 089ce9c5d65039d305709d2295902c134dcd85dd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 17:13:15 -0400 Subject: [PATCH 044/151] [wip] Adding in 'semirecent' checks according to R reference code --- validator/delphi_validator/validate.py | 57 +++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 6526c76f2..905eed89c 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -193,7 +193,21 @@ def check_max_allowed_max_date(max_date, generation_date): def reldiff_by_min(x, y): return (x - y) / min(x,y) -def check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo): +def check_rapid_change(df_to_test, df_to_compare, checking_date, date_list, sig, geo): + """ + Compare number of obervations per day in test dataframe vs reference dataframe. + + Arguments: + - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_compare: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - checking_date + - date_list + - sig + - geo + + Returns: + - None + """ recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) @@ -271,13 +285,14 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d export_files = read_filenames(export_dir) date_filter = make_date_filter(start_date, end_date) + # List of tuples of CSV names and regex match objects. 
validate_files = [(f, m) for (f, m) in export_files if date_filter(f,m)] all_frames = [] - # # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? - # check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') - # check_max_allowed_max_date(end_date, generation_date) + # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? + check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + check_max_allowed_max_date(end_date, generation_date) # First, check file formats check_missing_dates(validate_files, start_date, end_date) @@ -289,18 +304,26 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d check_bad_val(df, match.groupdict()['signal']) check_bad_se(df, missing_se_allowed) check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) + + # Get geo_type, date, and signal name as specified by CSV name. df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] + + # Add current CSV data to all_frames. all_frames.append(df) # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) + # Get all expected combinations of geo_type and signal. geo_sig_cmbo = get_geo_sig_cmbo(data_source) + + # Get list of dates we expect to see in the CSV data. date_slist = df['date'].unique().tolist() date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) + # Get list of CSV names. filenames = [name_match_pair[0] for name_match_pair in validate_files] ## recent_lookbehind: start from the check date and working backward in time, @@ -313,6 +336,29 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d ## do we use to form the reference statistics? 
semirecent_lookbehind = timedelta(days=7) + start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + max(all_frames["date"]) - max_check_lookbehind + 1) + end_checking_date = max(all_frames["date"]) + + if (start_checking_date > end_checking_date): + raise ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks") + + + for checking_date in range(start_checking_date, end_checking_date): + known_irregularities = get_known_irregularities(checking_date, filename) + + recent_cutoff_date = checking_date - recent_lookbehind + 1 + semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 + + recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') + semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') + + if (recent_df_to_test["se"].isnull().mean() > 0.5): + print('Recent se values are >50% NA') + + + recent_rows_per_reporting_day + smooth_option_regex = re.compile(r'([^_]+)') @@ -343,11 +389,12 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d column_names = ["geo_value", "val", "se", "sample_size", "time_value"] recent_api_df = recent_api_df.reindex(columns=column_names) + if (recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') if sanity_check_rows_per_day: - check_rapid_change(checking_date, recent_df, recent_api_df, date_list, sig, geo) + check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) From b9ce6157de53246e6389017f31d409d628377098 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 17:15:21 -0400 Subject: [PATCH 045/151] [wip] Generalize check_rapid_change() --- validator/delphi_validator/validate.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 905eed89c..adb0a9b02 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -208,11 +208,11 @@ def check_rapid_change(df_to_test, df_to_compare, checking_date, date_list, sig, Returns: - None """ - recent_rows_per_reporting_day = recent_df[recent_df['time_value'] == checking_date].shape[0] - recent_api_rows_per_reporting_day = recent_api_df.shape[0] / len(date_list) + test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] + reference_rows_per_reporting_day = df_to_compare.shape[0] / len(date_list) - if(abs(reldiff_by_min(recent_rows_per_reporting_day, recent_api_rows_per_reporting_day)) > 0.35): - raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (latest vs recent window of data)") + if(abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): + raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs recent window of data)") def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): # pdb.set_trace() From 138cc5c5bcc2c0afd799436d0866e86ccae35a7f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 22 Sep 2020 18:51:23 -0400 Subject: [PATCH 046/151] [wip] Replace Nones in API datapull with NAs to make numerical manipulation easier. 
Added docstring and applied some fixes to check_avg_value_diffs() --- validator/delphi_validator/validate.py | 63 ++++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index adb0a9b02..0d6ed1b07 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -214,39 +214,49 @@ def check_rapid_change(df_to_test, df_to_compare, checking_date, date_list, sig, if(abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs recent window of data)") -def check_avg_val_diffs(recent_df, recent_api_df, smooth_option): +def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): + """ + Compare average values in test dataframe vs reference dataframe. + + Arguments: + - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_compare: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - smooth_option: "raw" or "smoothed", choosen + + Returns: + - None + """ # pdb.set_trace() - # TODO: something is wrong with this check definition. - recent_df = recent_df.drop(columns=['geo_id']) - mean_recent_df = recent_df[['val', 'se', 'sample_size']].mean() - recent_api_df = recent_api_df.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() - recent_api_df = recent_api_df.drop(columns=['geo_value']) + # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. + df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_test = df_to_test.drop(columns=['geo_id']) + + # Average val, se, and sample_size values together over all geo_ids. 
+ test_mean = df_to_test.mean() + + df_to_compare = df_to_compare.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_compare = df_to_compare.drop(columns=['geo_value']) - mean_recent_api_df = recent_api_df.mean() + reference_mean = df_to_compare.mean() - mean_stddiff = ((mean_recent_df - mean_recent_api_df).mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) - mean_stdabsdiff = ((mean_recent_df - mean_recent_api_df).abs().mean() * 2) / (mean_recent_df.mean() + mean_recent_api_df.mean()) + # TODO: Look at reference code for this section. + mean_stddiff = ((test_mean - reference_mean).mean() * 2) / (test_mean.mean() + reference_mean.mean()) + mean_stdabsdiff = ((test_mean - reference_mean).abs().mean() * 2) / (test_mean.mean() + reference_mean.mean()) classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) - # Code reference from R code - # changesum.by.variable.with.flags = changesum.by.variable %>>% - # dplyr::mutate(mean.stddiff.high = abs(mean.stddiff) > thresholds[["mean.stddiff"]] | - # variable=="val" & abs(mean.stddiff) > thresholds[["val.mean.stddiff"]], - # mean.stdabsdiff.high = mean.stdabsdiff > thresholds[["mean.stdabsdiff"]]) %>>% - # Todo - Check whats the purpose of variable=="val" in the above statement - switcher = { 'raw': raw_thresholds, 'smoothed': smoothed_thresholds, } # Get the function from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - + + # TODO: comparisons wrong here. Look at reference code. 
mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() # or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() @@ -336,6 +346,7 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d ## do we use to form the reference statistics? semirecent_lookbehind = timedelta(days=7) + # TODO: Check recent data against semirecent and API data. start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, max(all_frames["date"]) - max_check_lookbehind + 1) end_checking_date = max(all_frames["date"]) @@ -356,8 +367,18 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d if (recent_df_to_test["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') + if sanity_check_rows_per_day: + check_rapid_change(recent_df_to_test, semirecent_df_to_test) + + recent_df_to_test["recency"] = "recent" + semirecent_df_to_test["recency"] = "semirecent" + + recency_df = pd.concat([recent_df_to_test, semirecent_df_to_test]) + + # TODO: Continue with check_avg_val_diffs() here. + + - recent_rows_per_reporting_day smooth_option_regex = re.compile(r'([^_]+)') @@ -382,7 +403,10 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d recent_end_date = checking_date - recent_lookbehind recent_begin_date = checking_date - max_check_lookbehind recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) - + + # Replace None with NA to make numerical manipulation easier. 
+ recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) + recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) @@ -399,7 +423,6 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) - # TODO: Add semirecent check? kroc += 1 if kroc == 2: break From 4e57493f41e7f3cfdb443d69fa67bab82986fc26 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 23 Sep 2020 14:47:55 -0400 Subject: [PATCH 047/151] [wip] Fixed calculation of mean_stddiff and mean_stdabsdiff in check_avg_val_diffs() --- validator/delphi_validator/validate.py | 93 ++++++++++++++------------ 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 0d6ed1b07..3c43ccba3 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -226,23 +226,29 @@ def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): Returns: - None """ - # pdb.set_trace() - # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() - df_to_test = df_to_test.drop(columns=['geo_id']) - - # Average val, se, and sample_size values together over all geo_ids. - test_mean = df_to_test.mean() - - df_to_compare = df_to_compare.groupby(['geo_value'], as_index=False)[['val', 'se', 'sample_size']].mean() - df_to_compare = df_to_compare.drop(columns=['geo_value']) - - reference_mean = df_to_compare.mean() - - # TODO: Look at reference code for this section. 
- mean_stddiff = ((test_mean - reference_mean).mean() * 2) / (test_mean.mean() + reference_mean.mean()) - mean_stdabsdiff = ((test_mean - reference_mean).abs().mean() * 2) / (test_mean.mean() + reference_mean.mean()) + df_to_test["type"] = "test" + + df_to_compare = df_to_compare.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_compare["type"] = "reference" + + df_all = pd.concat([df_to_compare, df_to_test]) + + + df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"]).pivot(index=("geo_id", "variable"), columns="type", values="value").reset_index(("geo_id","variable")).dropna().assign( + type_diff=lambda x: x["test"] - x["reference"], + abs_type_diff=lambda x: abs(x["type_diff"]) + ).groupby("variable", as_index=False).agg( + mean_type_diff=("type_diff", "mean"), + mean_abs_type_diff=("abs_type_diff", "mean"), + mean_test_var=("test", "mean"), + mean_ref_var=("reference", "mean") + ).assign( + mean_stddiff=lambda x: 2 * x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), + mean_stdabsdiff=lambda x: 2 * x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) + )[["variable", "mean_stddiff", "mean_stdabsdiff"]] + classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) @@ -300,9 +306,9 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d all_frames = [] - # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? - check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') - check_max_allowed_max_date(end_date, generation_date) + # # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? 
+ # check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + # check_max_allowed_max_date(end_date, generation_date) # First, check file formats check_missing_dates(validate_files, start_date, end_date) @@ -311,7 +317,7 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d validate_daily(df, filename, max_check_lookbehind, generation_date) check_bad_geo_id(df, match.groupdict()['geo_type']) - check_bad_val(df, match.groupdict()['signal']) + # check_bad_val(df, match.groupdict()['signal']) check_bad_se(df, missing_se_allowed) check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) @@ -346,36 +352,36 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d ## do we use to form the reference statistics? semirecent_lookbehind = timedelta(days=7) - # TODO: Check recent data against semirecent and API data. - start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, - max(all_frames["date"]) - max_check_lookbehind + 1) - end_checking_date = max(all_frames["date"]) + # # TODO: Check recent data against semirecent and API data. 
+ # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + # max(all_frames["date"]) - max_check_lookbehind + 1) + # end_checking_date = max(all_frames["date"]) - if (start_checking_date > end_checking_date): - raise ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks") + # if (start_checking_date > end_checking_date): + # raise ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks") - for checking_date in range(start_checking_date, end_checking_date): - known_irregularities = get_known_irregularities(checking_date, filename) + # for checking_date in range(start_checking_date, end_checking_date): + # known_irregularities = get_known_irregularities(checking_date, filename) - recent_cutoff_date = checking_date - recent_lookbehind + 1 - semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 + # recent_cutoff_date = checking_date - recent_lookbehind + 1 + # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 - recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') - semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') + # recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') + # semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') - if (recent_df_to_test["se"].isnull().mean() > 0.5): - print('Recent se values are >50% NA') + # if (recent_df_to_test["se"].isnull().mean() > 0.5): + # print('Recent se values are >50% NA') - if sanity_check_rows_per_day: - check_rapid_change(recent_df_to_test, semirecent_df_to_test) + # if sanity_check_rows_per_day: + # check_rapid_change(recent_df_to_test, semirecent_df_to_test) - recent_df_to_test["recency"] = 
"recent" - semirecent_df_to_test["recency"] = "semirecent" + # recent_df_to_test["recency"] = "recent" + # semirecent_df_to_test["recency"] = "semirecent" - recency_df = pd.concat([recent_df_to_test, semirecent_df_to_test]) + # recency_df = pd.concat([recent_df_to_test, semirecent_df_to_test]) - # TODO: Continue with check_avg_val_diffs() here. + # # TODO: Continue with check_avg_val_diffs() here. @@ -407,18 +413,19 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d # Replace None with NA to make numerical manipulation easier. recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) - recent_api_df.rename(columns={'stderr': 'se', 'value': 'val'}, inplace = True) + # Rename columns. + recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) - column_names = ["geo_value", "val", "se", "sample_size", "time_value"] + column_names = ["geo_id", "val", "se", "sample_size", "time_value"] recent_api_df = recent_api_df.reindex(columns=column_names) if (recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') - if sanity_check_rows_per_day: - check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) + # if sanity_check_rows_per_day: + # check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) From 9d2d043fd1f1583acd99fb6057933bb457b0e94e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 23 Sep 2020 17:17:50 -0400 Subject: [PATCH 048/151] Fixed check_avg_val_diffs(). Now runs like reference code. Added comments. 
--- validator/delphi_validator/validate.py | 38 ++++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 3c43ccba3..5125aa1f0 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -221,7 +221,7 @@ def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): Arguments: - df_to_test: pandas dataframe of "recent" CSV source data - df_to_compare: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - smooth_option: "raw" or "smoothed", choosen + - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") Returns: - None @@ -235,7 +235,7 @@ def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): df_all = pd.concat([df_to_compare, df_to_test]) - + # For each variable type (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"]).pivot(index=("geo_id", "variable"), columns="type", values="value").reset_index(("geo_id","variable")).dropna().assign( type_diff=lambda x: x["test"] - x["reference"], abs_type_diff=lambda x: abs(x["type_diff"]) @@ -248,10 +248,10 @@ def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): mean_stddiff=lambda x: 2 * x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), mean_stdabsdiff=lambda x: 2 * x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) )[["variable", "mean_stddiff", "mean_stdabsdiff"]] - - classes = ['mean.stddiff', 'val.mean.stddiff', 'mean.stdabsdiff'] - raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes) + # Set thresholds for raw and smoothed variables. 
+ classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] + raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) @@ -259,16 +259,18 @@ def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): 'raw': raw_thresholds, 'smoothed': smoothed_thresholds, } - # Get the function from switcher dictionary + + # Get the selected thresholds from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - - # TODO: comparisons wrong here. Look at reference code. - mean_stddiff_high = (np.absolute(mean_stddiff) > thres.loc['mean.stddiff']).bool() # or (np.absolute(mean_stddiff) > thres.loc['val.mean.stddiff"']).bool() - mean_stdabsdiff_high = (mean_stdabsdiff > thres.loc['mean.stdabsdiff']).bool() + # Check if the calculated mean differences are high, compared to the thresholds. + mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ((df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) + mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() - if mean_stddiff_high or mean_stdabsdiff_high: - raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & semirecent data seem' \ + flag = mean_stddiff_high or mean_stdabsdiff_high + + if flag: + raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' \ + 'large --- either large increase tending toward one direction or large mean absolute' \ + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.') @@ -306,9 +308,9 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d all_frames = [] - # # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? - # check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') - # check_max_allowed_max_date(end_date, generation_date) + # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? + check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + check_max_allowed_max_date(end_date, generation_date) # First, check file formats check_missing_dates(validate_files, start_date, end_date) @@ -317,7 +319,7 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d validate_daily(df, filename, max_check_lookbehind, generation_date) check_bad_geo_id(df, match.groupdict()['geo_type']) - # check_bad_val(df, match.groupdict()['signal']) + check_bad_val(df, match.groupdict()['signal']) check_bad_se(df, missing_se_allowed) check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) @@ -424,8 +426,8 @@ def validate(export_dir, start_date, end_date, data_source, params, generation_d if (recent_df["se"].isnull().mean() > 0.5): print('Recent se values are >50% NA') - # if sanity_check_rows_per_day: - # check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) + if sanity_check_rows_per_day: + check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) if sanity_check_value_diffs: check_avg_val_diffs(recent_df, recent_api_df, smooth_option) From 7f6a919d422c2d0933dec3c4bf54c5a9231bcdfb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 24 Sep 2020 10:10:43 -0400 Subject: [PATCH 049/151] Validator now uses a Validator class to allow validation to complete while 
collecting errors, warnings, and messages, which are printed on exit. --- validator/delphi_validator/run.py | 5 +- validator/delphi_validator/validate.py | 759 +++++++++++++------------ 2 files changed, 393 insertions(+), 371 deletions(-) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 4b3ee32db..1ac7a7c6a 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd from delphi_utils import read_params -from .validate import validate +from .validate import Validator from .datafetcher import read_filenames @@ -20,4 +20,5 @@ def run_module(): dtobj_sdate = datetime.date(datetime.strptime(params['start_date'], '%Y-%m-%d')) dtobj_edate = datetime.date(datetime.strptime(params['end_date'], '%Y-%m-%d')) - validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source, params) + validator = Validator() + validator.validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source, params) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 5125aa1f0..e22728d63 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -25,416 +25,437 @@ def __init__(self, expression, message): self.expression = expression self.message = message -def make_date_filter(start_date, end_date): - start_code = int(start_date.strftime("%Y%m%d")) - end_code = int(end_date.strftime("%Y%m%d")) - def f(filename, match): - if not match: return False - code = int(match.groupdict()['date']) - return code > start_code and code < end_code - return f - -def validate_daily(df_to_test, nameformat, max_check_lookbehind, generation_date): - """ - Perform some automated format & sanity checks of inputs. 
- - Arguments: - - df_to_test: pandas dataframe of CSV source data - - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - - Returns: - - None - """ - - if (not isinstance(max_check_lookbehind, timedelta)): - raise ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta") - - if( not isinstance(generation_date, date) or generation_date > date.today()): - raise ValidationError(generation_date, f"generation.date ({generation.date}) must be a datetime.date type and not in the future.") - - pattern_found = filename_regex.match(nameformat) - if (not nameformat or not pattern_found): - raise ValidationError(nameformat, 'nameformat ({nameformat}) not recognized') - - if not isinstance(df_to_test, pd.DataFrame): - raise ValidationError(nameformat, 'df_to_test must be a pandas dataframe.') - - # TODO: check column names and types in df_to_test. Currently skipped since load_csv() specifies field names and types on read. Extra columns will simply be ignored during later processing. - - -def check_bad_geo_id(df_to_test, geo_type): - """ - Check validity of geo type and values, according to regex pattern. 
- - Arguments: - - df_to_test: pandas dataframe of CSV source data - - geo_type: string from CSV name specifying geo type (state, county, etc) of data - - Returns: - - None - """ - if geo_type not in negated_regex_dict: - raise ValidationError(geo_type,"Unrecognized geo type") - - def find_all_unexpected_geo_ids(df_to_test, negated_regex): - unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] - if(len(unexpected_geos) > 0): - raise ValidationError(unexpected_geos,"Non-conforming geo_ids exist!") - - find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - -def check_missing_dates(daily_filenames, sdate, edate): - number_of_dates = edate - sdate + timedelta(days=1) - date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} - unique_dates = set() - - for daily_filename in daily_filenames: - unique_dates.add(datetime.strptime(daily_filename[0][0:8], '%Y%m%d')) - - check_dateholes = list(date_seq.difference(unique_dates)) - check_dateholes.sort() - - if check_dateholes: - print("Missing dates are observed; if these dates are already in the API they would not be updated") - print(check_dateholes) - -def check_bad_val(df_to_test, signal_type): - """ - Check value field for validity. 
- - Arguments: - - df_to_test: pandas dataframe of CSV source data - - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data - - Returns: - - None - """ - proportion_option = True if 'prop' in signal_type or 'pct' in signal_type else False - - if proportion_option: - if (not df_to_test[(df_to_test['val'] > 100)].empty): - raise ValidationError(signal_type, "val column can't have any cell greater than 100") - - if (df_to_test['val'].isnull().values.any()): - raise ValidationError(None,"val column can't have any cell that is NA") - - if (not df_to_test[(df_to_test['val'] < 0)].empty): - raise ValidationError(None,"val column can't have any cell smaller than 0") - -def check_bad_se(df_to_test, missing_se_allowed): - """ - Check standard errors for validity. - - Arguments: - - df_to_test: pandas dataframe of CSV source data - - missing_se_allowed: boolean specified in params.json - - Returns: - - None - """ - df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) - - df_to_test['se']= df_to_test['se'].round(3) - df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) - - if not missing_se_allowed: - if (df_to_test['se'].isnull().values.any()): - raise ValidationError(None, "se must not be NA") + +class Validator(object): + + def __init__(self): + self.raised = [] + + def make_date_filter(self, start_date, end_date): + start_code = int(start_date.strftime("%Y%m%d")) + end_code = int(end_date.strftime("%Y%m%d")) + def f(filename, match): + if not match: return False + code = int(match.groupdict()['date']) + return code > start_code and code < end_code + return f + + def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generation_date): + """ + Perform some automated format & sanity checks of inputs. 
- result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + Arguments: + - df_to_test: pandas dataframe of CSV source data + - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + + Returns: + - None + """ + + if (not isinstance(max_check_lookbehind, timedelta)): + self.raised.append(ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta")) - if not result.empty: - raise ValidationError(None, "se must be in (0,min(50,val*(1+eps))]") + if( not isinstance(generation_date, date) or generation_date > date.today()): + self.raised.append(ValidationError(generation_date, f"generation.date ({generation.date}) must be a datetime.date type and not in the future.")) + + pattern_found = filename_regex.match(nameformat) + if (not nameformat or not pattern_found): + self.raised.append(ValidationError(nameformat, 'nameformat ({nameformat}) not recognized')) - elif missing_se_allowed: - result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') + if not isinstance(df_to_test, pd.DataFrame): + self.raised.append(ValidationError(nameformat, 'df_to_test must be a pandas dataframe.')) - if not result.empty: - raise ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]") - - result = df_to_test.query('(val == 0) & (se == 0)') + # TODO: check column names and types in df_to_test. Currently skipped since load_csv() specifies field names and types on read. Extra columns will simply be ignored during later processing. - if not result.empty: - raise ValidationError(None, "when signal value is 0, se must be non-zero. 
please use Jeffreys correction to generate an appropriate se") -def check_bad_sample_size(df_to_test, minimum_sample_size, missing_sample_size_allowed): - if not missing_sample_size_allowed: - if (df_to_test['sample_size'].isnull().values.any()): - raise ValidationError(None, "sample_size must not be NA") + def check_bad_geo_id(self, df_to_test, geo_type): + """ + Check validity of geo type and values, according to regex pattern. + + Arguments: + - df_to_test: pandas dataframe of CSV source data + - geo_type: string from CSV name specifying geo type (state, county, etc) of data + + Returns: + - None + """ + if geo_type not in negated_regex_dict: + self.raised.append(ValidationError(geo_type,"Unrecognized geo type")) + + def find_all_unexpected_geo_ids(df_to_test, negated_regex): + unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] + if(len(unexpected_geos) > 0): + self.raised.append(ValidationError(unexpected_geos,"Non-conforming geo_ids exist!")) - result = df_to_test.query('(sample_size < @minimum_sample_size)') + find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - if not result.empty: - raise ValidationError(None, "sample size must be >= {minimum_sample_size}") + def check_missing_dates(self, daily_filenames, sdate, edate): + number_of_dates = edate - sdate + timedelta(days=1) + date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} + unique_dates = set() + + for daily_filename in daily_filenames: + unique_dates.add(datetime.strptime(daily_filename[0][0:8], '%Y%m%d')) + + check_dateholes = list(date_seq.difference(unique_dates)) + check_dateholes.sort() + + if check_dateholes: + self.raised.append((check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) + + def check_bad_val(self, df_to_test, signal_type): + """ + Check value field for validity. 
+ + Arguments: + - df_to_test: pandas dataframe of CSV source data + - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data + + Returns: + - None + """ + proportion_option = True if 'prop' in signal_type or 'pct' in signal_type else False + + if proportion_option: + if (not df_to_test[(df_to_test['val'] > 100)].empty): + self.raised.append(ValidationError(signal_type, "val column can't have any cell greater than 100")) + + if (df_to_test['val'].isnull().values.any()): + self.raised.append(ValidationError(None,"val column can't have any cell that is NA")) + + if (not df_to_test[(df_to_test['val'] < 0)].empty): + self.raised.append(ValidationError(None,"val column can't have any cell smaller than 0")) + + def check_bad_se(self, df_to_test, missing_se_allowed): + """ + Check standard errors for validity. + + Arguments: + - df_to_test: pandas dataframe of CSV source data + - missing_se_allowed: boolean specified in params.json + + Returns: + - None + """ + df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) + + df_to_test['se']= df_to_test['se'].round(3) + df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) + + if not missing_se_allowed: + if (df_to_test['se'].isnull().values.any()): + self.raised.append(ValidationError(None, "se must not be NA")) + + result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + + if not result.empty: + self.raised.append(ValidationError(None, "se must be in (0,min(50,val*(1+eps))]")) - elif missing_sample_size_allowed: - result = df_to_test.query('~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') + elif missing_se_allowed: + result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') + + if not result.empty: + self.raised.append(ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]")) + + result = df_to_test.query('(val == 0) & (se == 0)') if not result.empty: - 
raise ValidationError(None, "sample size must be NA or >= {minimum_sample_size}") - -def check_min_allowed_max_date(max_date, generation_date, weighted_option='unweighted'): - switcher = { - 'unweighted': timedelta(days=1), - 'weighted': timedelta(days=4) - } - # Get the function from switcher dictionary - thres = switcher.get(weighted_option, lambda: "Invalid weighting option") - - if (max_date < generation_date - thres): - raise ValidationError(None, "latest date of generated file seems too long ago") - -def check_max_allowed_max_date(max_date, generation_date): - if (max_date < generation_date - timedelta(days=1)): - raise ValidationError(None, "latest date of generated file seems too recent") - -def reldiff_by_min(x, y): - return (x - y) / min(x,y) - -def check_rapid_change(df_to_test, df_to_compare, checking_date, date_list, sig, geo): - """ - Compare number of obervations per day in test dataframe vs reference dataframe. - - Arguments: - - df_to_test: pandas dataframe of "recent" CSV source data - - df_to_compare: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - checking_date - - date_list - - sig - - geo - - Returns: - - None - """ - test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] - reference_rows_per_reporting_day = df_to_compare.shape[0] / len(date_list) - - if(abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): - raise ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs recent window of data)") - -def check_avg_val_diffs(df_to_test, df_to_compare, smooth_option): - """ - Compare average values in test dataframe vs reference dataframe. 
- - Arguments: - - df_to_test: pandas dataframe of "recent" CSV source data - - df_to_compare: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") - - Returns: - - None - """ - # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. - df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() - df_to_test["type"] = "test" - - df_to_compare = df_to_compare.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() - df_to_compare["type"] = "reference" - - df_all = pd.concat([df_to_compare, df_to_test]) - - # For each variable type (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. - df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"]).pivot(index=("geo_id", "variable"), columns="type", values="value").reset_index(("geo_id","variable")).dropna().assign( - type_diff=lambda x: x["test"] - x["reference"], - abs_type_diff=lambda x: abs(x["type_diff"]) - ).groupby("variable", as_index=False).agg( - mean_type_diff=("type_diff", "mean"), - mean_abs_type_diff=("abs_type_diff", "mean"), - mean_test_var=("test", "mean"), - mean_ref_var=("reference", "mean") - ).assign( - mean_stddiff=lambda x: 2 * x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), - mean_stdabsdiff=lambda x: 2 * x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) - )[["variable", "mean_stddiff", "mean_stdabsdiff"]] - - # Set thresholds for raw and smoothed variables. 
- classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] - raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T - - smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) - - switcher = { - 'raw': raw_thresholds, - 'smoothed': smoothed_thresholds, - } - - # Get the selected thresholds from switcher dictionary - thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - - # Check if the calculated mean differences are high, compared to the thresholds. - mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ((df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) - mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() - - flag = mean_stddiff_high or mean_stdabsdiff_high - - if flag: - raise ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' \ - + 'large --- either large increase tending toward one direction or large mean absolute' \ - + 'difference, relative to average values of corresponding variables. For the former' \ - + 'check, tolerances for `val` are more restrictive than those for other columns.') - -def validate(export_dir, start_date, end_date, data_source, params, generation_date = date.today()): - """ - Perform data checks. - - Arguments: - - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - sanity_check_rows_per_day - - sanity_check_value_diffs: - - check_vs_working - - Returns: - - None - """ - # Get user settings from params or if not provided, set default. 
- max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) - minimum_sample_size = params.get('minimum_sample_size', 100) - missing_se_allowed = params.get('missing_se_allowed', False) - missing_sample_size_allowed = params.get('missing_sample_size_allowed', False) - - sanity_check_rows_per_day = params.get('sanity_check_rows_per_day', True) - sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) - check_vs_working = params.get('check_vs_working', True) - - - export_files = read_filenames(export_dir) - date_filter = make_date_filter(start_date, end_date) - # List of tuples of CSV names and regex match objects. - validate_files = [(f, m) for (f, m) in export_files if date_filter(f,m)] - - all_frames = [] - - # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? - check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') - check_max_allowed_max_date(end_date, generation_date) - - # First, check file formats - check_missing_dates(validate_files, start_date, end_date) - for filename, match in validate_files: - df = load_csv(join(export_dir, filename)) - - validate_daily(df, filename, max_check_lookbehind, generation_date) - check_bad_geo_id(df, match.groupdict()['geo_type']) - check_bad_val(df, match.groupdict()['signal']) - check_bad_se(df, missing_se_allowed) - check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) + self.raised.append(ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) - # Get geo_type, date, and signal name as specified by CSV name. 
- df['geo_type'] = match.groupdict()['geo_type'] - df['date'] = match.groupdict()['date'] - df['signal'] = match.groupdict()['signal'] + def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_size_allowed): + if not missing_sample_size_allowed: + if (df_to_test['sample_size'].isnull().values.any()): + self.raised.append(ValidationError(None, "sample_size must not be NA")) + + result = df_to_test.query('(sample_size < @minimum_sample_size)') - # Add current CSV data to all_frames. - all_frames.append(df) + if not result.empty: + self.raised.append(ValidationError(None, "sample size must be >= {minimum_sample_size}")) - # TODO: Multi-indexed dataframe for a given (signal, geo_type) - all_frames = pd.concat(all_frames) - - # Get all expected combinations of geo_type and signal. - geo_sig_cmbo = get_geo_sig_cmbo(data_source) + elif missing_sample_size_allowed: + result = df_to_test.query('~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') - # Get list of dates we expect to see in the CSV data. - date_slist = df['date'].unique().tolist() - date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) + if not result.empty: + self.raised.append(ValidationError(None, "sample size must be NA or >= {minimum_sample_size}")) - # Get list of CSV names. - filenames = [name_match_pair[0] for name_match_pair in validate_files] + def check_min_allowed_max_date(self, max_date, generation_date, weighted_option='unweighted'): + switcher = { + 'unweighted': timedelta(days=1), + 'weighted': timedelta(days=4) + } + # Get the function from switcher dictionary + thres = switcher.get(weighted_option, lambda: "Invalid weighting option") - ## recent_lookbehind: start from the check date and working backward in time, - ## how many days do we include in the window of date to check for anomalies? - ## Choosing 1 day checks just the check data itself. 
- recent_lookbehind = timedelta(days=1) + if (max_date < generation_date - thres): + self.raised.append(ValidationError(None, "latest date of generated file seems too long ago")) - ## semirecent_lookbehind: starting from the check date and working backward - ## in time, how many days -- before subtracting out the "recent" days --- - ## do we use to form the reference statistics? - semirecent_lookbehind = timedelta(days=7) + def check_max_allowed_max_date(self, max_date, generation_date): + if (max_date < generation_date - timedelta(days=1)): + self.raised.append(ValidationError(None, "latest date of generated file seems too recent")) - # # TODO: Check recent data against semirecent and API data. - # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, - # max(all_frames["date"]) - max_check_lookbehind + 1) - # end_checking_date = max(all_frames["date"]) + def reldiff_by_min(self, x, y): + return (x - y) / min(x,y) - # if (start_checking_date > end_checking_date): - # raise ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks") + def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_list, sig, geo): + """ + Compare number of obervations per day in test dataframe vs reference dataframe. 
+ + Arguments: + - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - checking_date + - date_list + - sig + - geo + + Returns: + - None + """ + test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] + reference_rows_per_reporting_day = df_to_reference.shape[0] / len(date_list) + + if(abs(self.reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): + self.raised.append(ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs recent window of data)")) + def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): + """ + Compare average values in test dataframe vs reference dataframe. + + Arguments: + - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") + + Returns: + - None + """ + # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. + df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_test["type"] = "test" + + df_to_reference = df_to_reference.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_reference["type"] = "reference" + + df_all = pd.concat([df_to_test, df_to_reference]) + + # For each variable type (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. 
+ df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"]).pivot(index=("geo_id", "variable"), columns="type", values="value").reset_index(("geo_id","variable")).dropna().assign( + type_diff=lambda x: x["test"] - x["reference"], + abs_type_diff=lambda x: abs(x["type_diff"]) + ).groupby("variable", as_index=False).agg( + mean_type_diff=("type_diff", "mean"), + mean_abs_type_diff=("abs_type_diff", "mean"), + mean_test_var=("test", "mean"), + mean_ref_var=("reference", "mean") + ).assign( + mean_stddiff=lambda x: 2 * x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), + mean_stdabsdiff=lambda x: 2 * x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) + )[["variable", "mean_stddiff", "mean_stdabsdiff"]] + + # Set thresholds for raw and smoothed variables. + classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] + raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T + + smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) + + switcher = { + 'raw': raw_thresholds, + 'smoothed': smoothed_thresholds, + } + + # Get the selected thresholds from switcher dictionary + thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") + + # Check if the calculated mean differences are high, compared to the thresholds. 
+ mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ((df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) + mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() + + flag = mean_stddiff_high or mean_stdabsdiff_high + + if flag: + self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' \ + + 'large --- either large increase tending toward one direction or large mean absolute' \ + + 'difference, relative to average values of corresponding variables. For the former' \ + + 'check, tolerances for `val` are more restrictive than those for other columns.')) + + def validate(self, export_dir, start_date, end_date, data_source, params, generation_date = date.today()): + """ + Runs all data checks. + + Arguments: + + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - sanity_check_rows_per_day + - sanity_check_value_diffs: + - check_vs_working + + Returns: + - None + """ + # Get user settings from params or if not provided, set default. 
+ max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) + minimum_sample_size = params.get('minimum_sample_size', 100) + missing_se_allowed = params.get('missing_se_allowed', False) + missing_sample_size_allowed = params.get('missing_sample_size_allowed', False) + + sanity_check_rows_per_day = params.get('sanity_check_rows_per_day', True) + sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) + check_vs_working = params.get('check_vs_working', True) + + + export_files = read_filenames(export_dir) + date_filter = self.make_date_filter(start_date, end_date) + # List of tuples of CSV names and regex match objects. + validate_files = [(f, m) for (f, m) in export_files if date_filter(f,m)] + + all_frames = [] + + # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? + self.check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + self.check_max_allowed_max_date(end_date, generation_date) - # for checking_date in range(start_checking_date, end_checking_date): - # known_irregularities = get_known_irregularities(checking_date, filename) + # First, check file formats + self.check_missing_dates(validate_files, start_date, end_date) + for filename, match in validate_files: + df = load_csv(join(export_dir, filename)) - # recent_cutoff_date = checking_date - recent_lookbehind + 1 - # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 + self.validate_daily(df, filename, max_check_lookbehind, generation_date) + self.check_bad_geo_id(df, match.groupdict()['geo_type']) + self.check_bad_val(df, match.groupdict()['signal']) + self.check_bad_se(df, missing_se_allowed) + self.check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) - # recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') - # semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= 
@semirecent_cutoff_date') + # Get geo_type, date, and signal name as specified by CSV name. + df['geo_type'] = match.groupdict()['geo_type'] + df['date'] = match.groupdict()['date'] + df['signal'] = match.groupdict()['signal'] - # if (recent_df_to_test["se"].isnull().mean() > 0.5): - # print('Recent se values are >50% NA') + # Add current CSV data to all_frames. + all_frames.append(df) - # if sanity_check_rows_per_day: - # check_rapid_change(recent_df_to_test, semirecent_df_to_test) + # TODO: Multi-indexed dataframe for a given (signal, geo_type) + all_frames = pd.concat(all_frames) + + # Get all expected combinations of geo_type and signal. + geo_sig_cmbo = get_geo_sig_cmbo(data_source) - # recent_df_to_test["recency"] = "recent" - # semirecent_df_to_test["recency"] = "semirecent" + # Get list of dates we expect to see in the CSV data. + date_slist = df['date'].unique().tolist() + date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) - # recency_df = pd.concat([recent_df_to_test, semirecent_df_to_test]) + # Get list of CSV names. + filenames = [name_match_pair[0] for name_match_pair in validate_files] - # # TODO: Continue with check_avg_val_diffs() here. + ## recent_lookbehind: start from the check date and working backward in time, + ## how many days do we include in the window of date to check for anomalies? + ## Choosing 1 day checks just the check data itself. + recent_lookbehind = timedelta(days=1) + ## semirecent_lookbehind: starting from the check date and working backward + ## in time, how many days -- before subtracting out the "recent" days --- + ## do we use to form the reference statistics? + semirecent_lookbehind = timedelta(days=7) + # ## New from reference code. + # # TODO: Check recent data against semirecent and API data. 
+ # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + # max(all_frames["date"]) - max_check_lookbehind + 1) + # end_checking_date = max(all_frames["date"]) + # if (start_checking_date > end_checking_date): + # self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) - smooth_option_regex = re.compile(r'([^_]+)') + # # Loop over all sets of dates for a given CSV. + # for checking_date in range(start_checking_date, end_checking_date): + # known_irregularities = get_known_irregularities(checking_date, filename) - kroc = 0 + # recent_cutoff_date = checking_date - recent_lookbehind + 1 + # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 - # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). - for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): - - m = smooth_option_regex.match(sig) - smooth_option = m.group(1) + # recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') + # semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') - if smooth_option not in ('raw', 'smoothed'): - smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' - - #recent_df.set_index("time_value", inplace = True) - print("Printing recent_df scenes:", recent_df.shape) - print(recent_df) - for checking_date in date_list: - #print(recent_df.loc[checking_date,:]) - # -recent- dataframe run backwards from the checking_date - recent_end_date = checking_date - recent_lookbehind - recent_begin_date = checking_date - max_check_lookbehind - recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) - - # Replace None with NA to make numerical manipulation easier. 
- recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) - - # Rename columns. - recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) - recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + # if (recent_df_to_test["se"].isnull().mean() > 0.5): + # self.raised.append('Recent se values are >50% NA') + + # if sanity_check_rows_per_day: + # self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) + + # # TODO: Get smooth_option from CSV name. + # if sanity_check_value_diffs: + # self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) + + # # ## Compare vs. covidcast in farther-back days: + # # if (max(covidalert.df.to.test[["date"]]) < max(covidcast.reference.df[["date"]])) { + # # print(nameformat) + # # print(max(covidalert.df.to.test[["date"]])) + # # print(max(covidcast.reference.df[["date"]])) + # # stop ('covidcast reference df has days beyond the max date in the =covidalert.df.to.test=; checks are not constructed to handle this case, and this situation may indicate that something locally is out-of-date, or, if the local working covidalert files have already been compared against covidcast, that there is a bug somewhere') + # # } + # # if (check.vs.working) { + # # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) + # # } + + + + smooth_option_regex = re.compile(r'([^_]+)') + + kroc = 0 + + # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). 
+ for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): + + m = smooth_option_regex.match(sig) + smooth_option = m.group(1) + + if smooth_option not in ('raw', 'smoothed'): + smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' - column_names = ["geo_id", "val", "se", "sample_size", "time_value"] + #recent_df.set_index("time_value", inplace = True) + print("Printing recent_df scenes:", recent_df.shape) + print(recent_df) + for checking_date in date_list: + # -recent- dataframe run backwards from the checking_date + recent_end_date = checking_date - recent_lookbehind + recent_begin_date = checking_date - max_check_lookbehind + recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) + + # Replace None with NA to make numerical manipulation easier. + recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) + + # Rename columns. + recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) + recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + + column_names = ["geo_id", "val", "se", "sample_size", "time_value"] - recent_api_df = recent_api_df.reindex(columns=column_names) + recent_api_df = recent_api_df.reindex(columns=column_names) - if (recent_df["se"].isnull().mean() > 0.5): - print('Recent se values are >50% NA') + if (recent_df["se"].isnull().mean() > 0.5): + self.raised.append('Recent se values are >50% NA') - if sanity_check_rows_per_day: - check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) + if sanity_check_rows_per_day: + self.check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) - if sanity_check_value_diffs: - check_avg_val_diffs(recent_df, recent_api_df, smooth_option) + if sanity_check_value_diffs: + self.check_avg_val_diffs(recent_df, recent_api_df, smooth_option) + + kroc += 1 + if kroc == 2: + break + + self.exit() + 
- kroc += 1 - if kroc == 2: - break - sys.exit() - + def exit(self): + if self.raised: + for message in self.raised: + print(message) + sys.exit(1) + else: + sys.exit(0) \ No newline at end of file From a7e1d1101094824c33175d0935269dac56bb245a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 25 Sep 2020 19:32:38 -0400 Subject: [PATCH 050/151] Print error messages one per line --- validator/delphi_validator/validate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index e22728d63..515c2a6c2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -453,9 +453,11 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera def exit(self): if self.raised: + print(len(self.raised), "messages") + for message in self.raised: print(message) sys.exit(1) else: - sys.exit(0) \ No newline at end of file + sys.exit(0) From 2f1f743878486621f5fd7dcd903ba02b6ad1ad35 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 28 Sep 2020 13:13:10 -0400 Subject: [PATCH 051/151] [wip] Added some docstrings --- validator/delphi_validator/validate.py | 115 +++++++++++++++++-------- 1 file changed, 77 insertions(+), 38 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 515c2a6c2..815da512e 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -22,22 +22,47 @@ class ValidationError(Exception): """ Error raised when validation check fails. 
""" def __init__(self, expression, message): + """ + Arguments: + - expression: relevant variables to message, e.g., if a date doesn't pass a check, provide a list of the date and the filename of the CSV it originated from + - message: str explaining why an error was raised + """ self.expression = expression self.message = message class Validator(object): + """ Class containing validation() function and supporting functions. Stores a list of all raised errors and warnings. """ def __init__(self): self.raised = [] def make_date_filter(self, start_date, end_date): + """ + Create a function. + + Arguments: + - start_date: datetime date object + - end_date: datetime date object + + Returns: + - None + """ + # Convert dates from datetime format to int. start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) + + def f(filename, match): - if not match: return False + """ + + """ + if not match: + return False + code = int(match.groupdict()['date']) return code > start_code and code < end_code + return f def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generation_date): @@ -189,11 +214,19 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option= thres = switcher.get(weighted_option, lambda: "Invalid weighting option") if (max_date < generation_date - thres): - self.raised.append(ValidationError(None, "latest date of generated file seems too long ago")) + self.raised.append(ValidationError(None, "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, generation_date): if (max_date < generation_date - timedelta(days=1)): - self.raised.append(ValidationError(None, "latest date of generated file seems too recent")) + self.raised.append(ValidationError(None, "most recent date of generated file seems too recent")) + + def check_max_date_vs_reference(self, df_to_test, df_to_reference): + if df_to_test["date"].max() < df_to_reference["date"].max(): + 
self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), + 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + + 'to handle this case, and this situation may indicate that something locally is out of date,' + + 'or, if the local working files have already been compared against the reference,' + + 'that there is a bug somewhere')) def reldiff_by_min(self, x, y): return (x - y) / min(x,y) @@ -313,12 +346,13 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera all_frames = [] - # TODO: What does unweighted vs weighted mean? 7dav vs not? Best place for these checks? + # TODO: What does unweighted vs weighted mean? See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 self.check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') self.check_max_allowed_max_date(end_date, generation_date) - # First, check file formats self.check_missing_dates(validate_files, start_date, end_date) + + # For every file, read in and do some basic format and value checks. for filename, match in validate_files: df = load_csv(join(export_dir, filename)) @@ -360,46 +394,51 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera semirecent_lookbehind = timedelta(days=7) - # ## New from reference code. - # # TODO: Check recent data against semirecent and API data. - # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, - # max(all_frames["date"]) - max_check_lookbehind + 1) - # end_checking_date = max(all_frames["date"]) + ## New from reference code. + # TODO: Check recent data against both semirecent and API data. 
+ start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + max(all_frames["date"]) - max_check_lookbehind + 1) + end_checking_date = max(all_frames["date"]) + + if (start_checking_date > end_checking_date): + self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) - # if (start_checking_date > end_checking_date): - # self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) + # Loop over all sets of dates for a given CSV. + for checking_date in range(start_checking_date, end_checking_date): + # TODO: Implement get_known_irregularities(). Add irregularity flags to other checks. + known_irregularities = get_known_irregularities(checking_date, filename) - # # Loop over all sets of dates for a given CSV. - # for checking_date in range(start_checking_date, end_checking_date): - # known_irregularities = get_known_irregularities(checking_date, filename) + recent_cutoff_date = checking_date - recent_lookbehind + 1 + semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 - # recent_cutoff_date = checking_date - recent_lookbehind + 1 - # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 + recent_df_to_test = all_frames.query('date <= @checking_date & date >= @recent_cutoff_date') + semirecent_df_to_test = all_frames.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') - # recent_df_to_test = df_to_test.query('date <= @checking_date & date >= @recent_cutoff_date') - # semirecent_df_to_test = df_to_test.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') + if (recent_df_to_test["se"].isnull().mean() > 0.5): + self.raised.append('Recent se values are >50% NA') - # if (recent_df_to_test["se"].isnull().mean() > 0.5): - # self.raised.append('Recent se values are >50% 
NA') + self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) - # if sanity_check_rows_per_day: - # self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) + if sanity_check_rows_per_day: + self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) - # # TODO: Get smooth_option from CSV name. - # if sanity_check_value_diffs: - # self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) + # TODO: Get smooth_option from CSV name. + if sanity_check_value_diffs: + self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) - # # ## Compare vs. covidcast in farther-back days: - # # if (max(covidalert.df.to.test[["date"]]) < max(covidcast.reference.df[["date"]])) { - # # print(nameformat) - # # print(max(covidalert.df.to.test[["date"]])) - # # print(max(covidcast.reference.df[["date"]])) - # # stop ('covidcast reference df has days beyond the max date in the =covidalert.df.to.test=; checks are not constructed to handle this case, and this situation may indicate that something locally is out-of-date, or, if the local working covidalert files have already been compared against covidcast, that there is a bug somewhere') - # # } - # # if (check.vs.working) { - # # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) - # # } + if check_vs_working: + pass + ## Goal is to see if past data has been changed. + # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) + # Get all files for a given indicator. 
+ # get_working_covidalert_daily("fb-survey", signal, geo.type, fbsurvey.extra.listcolspec) + # filter by date to keep only rows with date <= end_comparison_date, and date >= min date seen in both reference and test data + # if, after filtering, the reference data has 0 rows, stop with a message "not any reference data in the reference period" + + # full join test and reference data on geo_id and date + # create new columns checking whether a given field (val, se, sample_size) is missing in both test and reference data, or if it is within a set tolerance + # filter data to keep only rows where one of the new columns is true. if df is not empty, raise error that "mismatches between less recent data in data frame to test and corresponding reference data frame" smooth_option_regex = re.compile(r'([^_]+)') @@ -427,12 +466,12 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # Replace None with NA to make numerical manipulation easier. recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) - # Rename columns. + # Rename columns to match those in df_to_test. recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + # Reorder columns. 
column_names = ["geo_id", "val", "se", "sample_size", "time_value"] - recent_api_df = recent_api_df.reindex(columns=column_names) if (recent_df["se"].isnull().mean() > 0.5): From cb374cad55bb4db114a5c0f8423491f898cc24d3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 29 Sep 2020 12:24:50 -0400 Subject: [PATCH 052/151] [wip] Added docstrings to validate.py functions --- validator/delphi_validator/validate.py | 217 +++++++++++++++++-------- 1 file changed, 150 insertions(+), 67 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 815da512e..e22321f30 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -11,6 +11,7 @@ import pdb +# Recognized geo types. negated_regex_dict = { 'county': '^(?!\d{5}).*$', 'hrr': '^(?!\d{1,3}).*$', @@ -39,7 +40,7 @@ def __init__(self): def make_date_filter(self, start_date, end_date): """ - Create a function. + Create a function to return a boolean of whether a filename of appropriate format contains a date within the specified date range. Arguments: - start_date: datetime date object @@ -55,12 +56,23 @@ def make_date_filter(self, start_date, end_date): def f(filename, match): """ + Return a boolean of whether a filename of appropriate format contains a date within the specified date range. + + Arguments: + - filename: str + - match: regex match object based on filename_regex + Returns: + - boolean """ + # If regex match doesn't exist, current filename is not an appropriately formatted source data file. if not match: return False + # Convert date found in CSV name to int. code = int(match.groupdict()['date']) + + # Return boolean True if current file is within the defined date range. 
return code > start_code and code < end_code return f @@ -70,8 +82,8 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio Perform some automated format & sanity checks of inputs. Arguments: - - df_to_test: pandas dataframe of CSV source data - - nameformat: CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test @@ -101,7 +113,7 @@ def check_bad_geo_id(self, df_to_test, geo_type): Arguments: - df_to_test: pandas dataframe of CSV source data - - geo_type: string from CSV name specifying geo type (state, county, etc) of data + - geo_type: string from CSV name specifying geo type (state, county, msa, hrr) of data Returns: - None @@ -110,37 +122,56 @@ def check_bad_geo_id(self, df_to_test, geo_type): self.raised.append(ValidationError(geo_type,"Unrecognized geo type")) def find_all_unexpected_geo_ids(df_to_test, negated_regex): + """ + Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. + """ unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] if(len(unexpected_geos) > 0): self.raised.append(ValidationError(unexpected_geos,"Non-conforming geo_ids exist!")) find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - def check_missing_dates(self, daily_filenames, sdate, edate): + def check_missing_dates(self, daily_filenames, sdate, edate): + """ + Check for missing dates between the specified start and end dates. + + Arguments: + - daily_filenames: list of CSV source data filenames. 
+ - sdate: start date, in datetime format + - edate: end date, in datetime format + + Returns: + - None + """ number_of_dates = edate - sdate + timedelta(days=1) + + # Create set of all expected dates. date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} unique_dates = set() + # Add each date seen in CSV names to set. for daily_filename in daily_filenames: unique_dates.add(datetime.strptime(daily_filename[0][0:8], '%Y%m%d')) + # Diff expected and observed dates. check_dateholes = list(date_seq.difference(unique_dates)) check_dateholes.sort() if check_dateholes: - self.raised.append((check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) + self.raised.append(ValidationError(check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) def check_bad_val(self, df_to_test, signal_type): """ Check value field for validity. Arguments: - - df_to_test: pandas dataframe of CSV source data + - df_to_test: pandas dataframe of a single CSV of source data - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data Returns: - None """ + # Determine if signal is a proportion or percent proportion_option = True if 'prop' in signal_type or 'pct' in signal_type else False if proportion_option: @@ -159,11 +190,12 @@ def check_bad_se(self, df_to_test, missing_se_allowed): Arguments: - df_to_test: pandas dataframe of CSV source data - - missing_se_allowed: boolean specified in params.json + - missing_se_allowed: boolean indicating if missing standard errors should raise an exception or not Returns: - None """ + # Add a new se_upper_limit column. 
df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) df_to_test['se']= df_to_test['se'].round(3) @@ -173,16 +205,17 @@ def check_bad_se(self, df_to_test, missing_se_allowed): if (df_to_test['se'].isnull().values.any()): self.raised.append(ValidationError(None, "se must not be NA")) + # Find rows not in the allowed range for se. result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - self.raised.append(ValidationError(None, "se must be in (0,min(50,val*(1+eps))]")) + self.raised.append(ValidationError(None, "se must be in (0, min(50,val*(1+eps))]")) elif missing_se_allowed: result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: - self.raised.append(ValidationError(None, "se must be NA or in (0,min(50,val*(1+eps))]")) + self.raised.append(ValidationError(None, "se must be NA or in (0, min(50,val*(1+eps))]")) result = df_to_test.query('(val == 0) & (se == 0)') @@ -190,10 +223,22 @@ def check_bad_se(self, df_to_test, missing_se_allowed): self.raised.append(ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_size_allowed): + """ + Check sample sizes for validity. 
+ + Arguments: + - df_to_test: pandas dataframe of a single CSV of source data + - minimum_sample_size: int + - missing_sample_size_allowed: boolean indicating if missing sample size should raise an exception or not + + Returns: + - None + """ if not missing_sample_size_allowed: if (df_to_test['sample_size'].isnull().values.any()): self.raised.append(ValidationError(None, "sample_size must not be NA")) + # Find rows with sample size less than minimum allowed result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: @@ -206,21 +251,52 @@ def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_ self.raised.append(ValidationError(None, "sample size must be NA or >= {minimum_sample_size}")) def check_min_allowed_max_date(self, max_date, generation_date, weighted_option='unweighted'): + """ + Check if time since data was generated is reasonable or too long ago. + + Arguments: + - max_date: date of most recent data to be validated; datetime format. + - generation_date: date data to test was generated; datetime format. + - weighted_option: str; selects the "reasonable" threshold + + Returns: + - None + """ switcher = { 'unweighted': timedelta(days=1), 'weighted': timedelta(days=4) } - # Get the function from switcher dictionary + # Get the setting from switcher dictionary thres = switcher.get(weighted_option, lambda: "Invalid weighting option") if (max_date < generation_date - thres): self.raised.append(ValidationError(None, "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, generation_date): + """ + Check if time since data was generated is reasonable or too recent. + + Arguments: + - max_date: date of most recent data to be validated; datetime format. + - generation_date: date data to test was generated; datetime format. 
+ + Returns: + - None + """ if (max_date < generation_date - timedelta(days=1)): self.raised.append(ValidationError(None, "most recent date of generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference): + """ + Check if reference data is more recent than test data. + + Arguments: + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + + Returns: + - None + """ if df_to_test["date"].max() < df_to_reference["date"].max(): self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + @@ -229,6 +305,9 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference): 'that there is a bug somewhere')) def reldiff_by_min(self, x, y): + """ + Calculate relative difference between two numbers. + """ return (x - y) / min(x,y) def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_list, sig, geo): @@ -236,12 +315,12 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_li Compare number of obervations per day in test dataframe vs reference dataframe. 
Arguments: - - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - checking_date - - date_list - - sig - - geo + - date_list: list of dates to check + - sig: str; signal name as in the CSV name + - geo: str; geo type name (county, msa, hrr, state) as in the CSV name Returns: - None @@ -250,14 +329,14 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_li reference_rows_per_reporting_day = df_to_reference.shape[0] / len(date_list) if(abs(self.reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): - self.raised.append(ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs recent window of data)")) + self.raised.append(ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs test data)")) def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): """ - Compare average values in test dataframe vs reference dataframe. + Compare average values for each variable in test dataframe vs reference dataframe. Arguments: - - df_to_test: pandas dataframe of "recent" CSV source data + - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") @@ -273,11 +352,16 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): df_all = pd.concat([df_to_test, df_to_reference]) - # For each variable type (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. 
- df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"]).pivot(index=("geo_id", "variable"), columns="type", values="value").reset_index(("geo_id","variable")).dropna().assign( + # For each variable (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. + df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"] + ).pivot(index=("geo_id", "variable"), columns="type", values="value" + ).reset_index(("geo_id","variable") + ).dropna( + ).assign( type_diff=lambda x: x["test"] - x["reference"], abs_type_diff=lambda x: abs(x["type_diff"]) - ).groupby("variable", as_index=False).agg( + ).groupby("variable", as_index=False + ).agg( mean_type_diff=("type_diff", "mean"), mean_abs_type_diff=("abs_type_diff", "mean"), mean_test_var=("test", "mean"), @@ -290,7 +374,6 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): # Set thresholds for raw and smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T - smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) switcher = { @@ -301,7 +384,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): # Get the selected thresholds from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") - # Check if the calculated mean differences are high, compared to the thresholds. + # Check if the calculated mean differences are high compared to the thresholds. 
mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ((df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() @@ -313,17 +396,17 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): + 'difference, relative to average values of corresponding variables. For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.')) - def validate(self, export_dir, start_date, end_date, data_source, params, generation_date = date.today()): + def validate(self, export_dir, start_date, end_date, data_source, params={}, generation_date = date.today()): """ Runs all data checks. Arguments: - + - export_dir: path to data CSVs + - start_date: beginning date of data to check + - end_date: end date of data to check + - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html + - params: dictionary of user settings; if empty, defaults will be used - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - sanity_check_rows_per_day - - sanity_check_value_diffs: - - check_vs_working Returns: - None @@ -346,7 +429,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera all_frames = [] - # TODO: What does unweighted vs weighted mean? See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 + # TODO: Make weight_option based on signal name for Facebook data. 
See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 self.check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') self.check_max_allowed_max_date(end_date, generation_date) @@ -361,8 +444,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera self.check_bad_val(df, match.groupdict()['signal']) self.check_bad_se(df, missing_se_allowed) self.check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) - # Get geo_type, date, and signal name as specified by CSV name. + df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] @@ -376,8 +459,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # Get all expected combinations of geo_type and signal. geo_sig_cmbo = get_geo_sig_cmbo(data_source) - # Get list of dates we expect to see in the CSV data. - date_slist = df['date'].unique().tolist() + # Get list of dates we expect to see in all the CSV data. + date_slist = all_frames['date'].unique().tolist() date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) # Get list of CSV names. @@ -394,51 +477,51 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera semirecent_lookbehind = timedelta(days=7) - ## New from reference code. - # TODO: Check recent data against both semirecent and API data. - start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, - max(all_frames["date"]) - max_check_lookbehind + 1) - end_checking_date = max(all_frames["date"]) + # ## New from reference code. + # # TODO: Check recent data against both semirecent and API data. 
+ # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + # max(all_frames["date"]) - max_check_lookbehind + 1) + # end_checking_date = max(all_frames["date"]) - if (start_checking_date > end_checking_date): - self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) + # if (start_checking_date > end_checking_date): + # self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) - # Loop over all sets of dates for a given CSV. - for checking_date in range(start_checking_date, end_checking_date): - # TODO: Implement get_known_irregularities(). Add irregularity flags to other checks. - known_irregularities = get_known_irregularities(checking_date, filename) + # # Loop over all sets of dates for a given CSV. + # for checking_date in range(start_checking_date, end_checking_date): + # # TODO: Implement get_known_irregularities(). Add irregularity flags to other checks. 
+ # known_irregularities = get_known_irregularities(checking_date, filename) - recent_cutoff_date = checking_date - recent_lookbehind + 1 - semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 + # recent_cutoff_date = checking_date - recent_lookbehind + 1 + # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 - recent_df_to_test = all_frames.query('date <= @checking_date & date >= @recent_cutoff_date') - semirecent_df_to_test = all_frames.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') + # recent_df_to_test = all_frames.query('date <= @checking_date & date >= @recent_cutoff_date') + # semirecent_df_to_test = all_frames.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') - if (recent_df_to_test["se"].isnull().mean() > 0.5): - self.raised.append('Recent se values are >50% NA') + # if (recent_df_to_test["se"].isnull().mean() > 0.5): + # self.raised.append('Recent se values are >50% NA') - self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) + # self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) - if sanity_check_rows_per_day: - self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) + # if sanity_check_rows_per_day: + # self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) - # TODO: Get smooth_option from CSV name. - if sanity_check_value_diffs: - self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) + # # TODO: Get smooth_option from CSV name. + # if sanity_check_value_diffs: + # self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) - if check_vs_working: - pass + # if check_vs_working: + # pass - ## Goal is to see if past data has been changed. - # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) - # Get all files for a given indicator. 
- # get_working_covidalert_daily("fb-survey", signal, geo.type, fbsurvey.extra.listcolspec) - # filter by date to keep only rows with date <= end_comparison_date, and date >= min date seen in both reference and test data - # if, after filtering, the reference data has 0 rows, stop with a message "not any reference data in the reference period" + # ## Goal is to see if past data has been changed. + # # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) + # # Get all files for a given indicator. + # # get_working_covidalert_daily("fb-survey", signal, geo.type, fbsurvey.extra.listcolspec) + # # filter by date to keep only rows with date <= end_comparison_date, and date >= min date seen in both reference and test data + # # if, after filtering, the reference data has 0 rows, stop with a message "not any reference data in the reference period" - # full join test and reference data on geo_id and date - # create new columns checking whether a given field (val, se, sample_size) is missing in both test and reference data, or if it is within a set tolerance - # filter data to keep only rows where one of the new columns is true. if df is not empty, raise error that "mismatches between less recent data in data frame to test and corresponding reference data frame" + # # full join test and reference data on geo_id and date + # # create new columns checking whether a given field (val, se, sample_size) is missing in both test and reference data, or if it is within a set tolerance + # # filter data to keep only rows where one of the new columns is true. 
def get_geo_sig_cmbo(data_source):
    """
    Get the list of geo type-signal type combinations that we expect to see,
    based on combinations reported available by COVIDcast metadata.

    Arguments:
        - data_source: str; data source name, one of
          https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html

    Returns:
        - list of (geo_type, signal) tuples
    """
    meta = covidcast.metadata()
    source_meta = meta[meta['data_source'] == data_source]
    unique_signals = source_meta['signal'].unique().tolist()
    unique_geotypes = source_meta['geo_type'].unique().tolist()

    if data_source == 'fb-survey':
        # Currently metadata returns --*community*-- signals that don't get
        # generated in the new fb-pipeline. Sieving them out for now.
        # TODO: Include weighted whh_cmnty_cli and wnohh_cmnty_cli
        # BUG FIX: the original removed items from unique_signals while
        # iterating over that same list, which skips the element following
        # each removal and can leave some "community" signals in place.
        # Build a filtered list instead.
        unique_signals = [sig for sig in unique_signals if "community" not in sig]

    geo_sig_cmbo = list(product(unique_geotypes, unique_signals))
    print("Number of mixed types:", len(geo_sig_cmbo))

    return geo_sig_cmbo
+ - filenames: list of filenames + - date_slist: list of dates (formatted as strings) to check + + Returns: + - dataframe containing data for all dates in date_slist for a given geo type-signal type combination + - relevant geo type (str) + - relevant signal type (str) + """ for geo_sig in geo_sig_cmbo: df_list = list() + # Get all filenames for this geo_type and signal_type files = list(filter(lambda x: geo_sig[0] in x and geo_sig[1] in x, filenames)) + if (len(files) == 0): print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") yield pd.DataFrame(), geo_sig[0], geo_sig[1] continue + + # Load data from all found files. for f in files: df = load_csv(join(data_folder, f)) for dt in date_slist: + + # Add data's date, from CSV name, as new column if f.find(dt) != -1: gen_dt = datetime.strptime(dt, '%Y%m%d') df['time_value'] = gen_dt df_list.append(df) + yield pd.concat(df_list), geo_sig[0], geo_sig[1] def load_csv(path): + """ + Load CSV with specified column types. + """ return pd.read_csv( path, dtype={ @@ -86,12 +134,15 @@ def load_csv(path): }) def fetch_daily_data(data_source, survey_date, geo_type, signal): - data_to_validate = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) - if not isinstance(data_to_validate, pd.DataFrame): + """ + Get API data for a specified date, source, signal, and geo type. 
+ """ + data_to_reference = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) + if not isinstance(data_to_reference, pd.DataFrame): custom_msg = "Error fetching data on" + str(survey_date)+ \ "for data source:" + data_source + \ ", signal-type:"+ signal + \ ", geography-type:" + geo_type raise APIDataFetchError(custom_msg) - return data_to_validate + return data_to_reference From 23d39bcc91370549dd4739ad08eaac75678c7c84 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 29 Sep 2020 14:21:16 -0400 Subject: [PATCH 054/151] pylint recommended improvements --- validator/README.md | 4 +- validator/delphi_validator/datafetcher.py | 55 +++--- validator/delphi_validator/errors.py | 5 + validator/delphi_validator/run.py | 5 +- validator/delphi_validator/validate.py | 197 +++++++++++----------- 5 files changed, 145 insertions(+), 121 deletions(-) diff --git a/validator/README.md b/validator/README.md index bcac88fc1..af0af1071 100644 --- a/validator/README.md +++ b/validator/README.md @@ -2,8 +2,8 @@ The validator performs two main tasks: 1) Sanity checks on daily data generated from a pipeline of specific data - source. -2) Its does a comparative analysis with recent data from the API + source. 
+2) Its does a comparative analysis with recent data from the API to detect any anomalies such as spikes, significant value differences The validator validates against daily data thats already written in the disk diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 8b74157b3..1f265833c 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -1,14 +1,18 @@ -from os import listdir, stat -from os.path import isfile, join -import platform -import covidcast -import pandas as pd -from datetime import date, datetime, timedelta -from .errors import APIDataFetchError +# -*- coding: utf-8 -*- +""" +Functions to get CSV filenames and data. +""" + import re +from os import listdir +from os.path import isfile, join +from datetime import datetime from typing import List -import json from itertools import product +import pandas as pd + +import covidcast +from .errors import APIDataFetchError filename_regex = re.compile(r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') @@ -16,28 +20,30 @@ def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): """ Gets list of filenames in data folder and list of expected geo type-signal type combinations. 
- + Arguments: - path: path to data CSVs - - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html + - data_source: str; data source name, one of + https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html - date_slist: list of dates (formatted as strings) to check Returns: - list of filenames - - list of geo type-signal type combinations that we expect to see + - list of geo type-signal type combinations that we expect to see """ - geo_sig_cmbo = get_geo_sig_cmbo(data_source) + geo_sig_cmbo = get_geo_sig_cmbo(data_source) for cmb in geo_sig_cmbo: print(cmb) - filenames = read_relevant_date_filenames(data_folder, date_slist[0]) + filenames = read_relevant_date_filenames(path, date_slist[0]) return filenames, geo_sig_cmbo def get_geo_sig_cmbo(data_source): """ - Get list of geo type-signal type combinations that we expect to see, based on combinations reported available by Covidcast metadata. + Get list of geo type-signal type combinations that we expect to see, based on + combinations reported available by Covidcast metadata. """ meta = covidcast.metadata() source_meta = meta[meta['data_source']==data_source] @@ -51,7 +57,7 @@ def get_geo_sig_cmbo(data_source): for sig in unique_signals: if "community" in sig: unique_signals.remove(sig) - + geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) print("Number of mixed types:", len(geo_sig_cmbo)) @@ -72,7 +78,17 @@ def read_filenames(path): return daily_filenames def read_relevant_date_filenames(data_path, date_slist): - all_files = [f for f in listdir(path) if isfile(join(data_path, f))] + """ + Return a list of tuples of every filename in the specified directory if the file is in the specified date range. + + Arguments: + - data_path: path to the directory containing CSV data files. 
+ - date_slist: list of dates (formatted as strings) to check + + Returns: + - list + """ + all_files = [f for f in listdir(data_path) if isfile(join(data_path, f))] filenames = list() for fl in all_files: @@ -100,9 +116,9 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): df_list = list() # Get all filenames for this geo_type and signal_type - files = list(filter(lambda x: geo_sig[0] in x and geo_sig[1] in x, filenames)) + files = [file for file in filenames if geo_sig[0] in file and geo_sig[1] in file] - if (len(files) == 0): + if len(files) == 0: print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") yield pd.DataFrame(), geo_sig[0], geo_sig[1] continue @@ -116,7 +132,7 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): if f.find(dt) != -1: gen_dt = datetime.strptime(dt, '%Y%m%d') df['time_value'] = gen_dt - df_list.append(df) + df_list.append(df) yield pd.concat(df_list), geo_sig[0], geo_sig[1] @@ -145,4 +161,3 @@ def fetch_daily_data(data_source, survey_date, geo_type, signal): ", geography-type:" + geo_type raise APIDataFetchError(custom_msg) return data_to_reference - diff --git a/validator/delphi_validator/errors.py b/validator/delphi_validator/errors.py index 932b1c1f3..7ed08db36 100644 --- a/validator/delphi_validator/errors.py +++ b/validator/delphi_validator/errors.py @@ -1,3 +1,8 @@ +# -*- coding: utf-8 -*- +""" +Custom validator exceptions. +""" + class APIDataFetchError(Exception): """Exception raised for errors during validation. diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 1ac7a7c6a..12883322b 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -1,15 +1,12 @@ # -*- coding: utf-8 -*- -"""Functions to call when running the function. +"""Functions to call when running the tool. 
This module should contain a function called `run_module`, that is executed when the module is run with `python -m delphi_validator`. """ from datetime import datetime -import numpy as np -import pandas as pd from delphi_utils import read_params from .validate import Validator -from .datafetcher import read_filenames def run_module(): diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index e22321f30..eca355b61 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -1,15 +1,19 @@ +# -*- coding: utf-8 -*- +""" +Tools to validate CSV source data, including various check methods. +""" + import sys -from os.path import join import re +import math +from os.path import join +from datetime import date, datetime, timedelta import pandas as pd import numpy as np -from pathlib import Path -from itertools import product -from datetime import date, datetime, timedelta -from .datafetcher import * -import math -import pdb +import covidcast +from .datafetcher import load_csv, read_geo_sig_cmbo_files, read_filenames, get_geo_sig_cmbo, filename_regex + # Recognized geo types. negated_regex_dict = { @@ -20,6 +24,13 @@ 'national': '(?!usa).*$' } + +def reldiff_by_min(x, y): + """ + Calculate relative difference between two numbers. + """ + return (x - y) / min(x,y) + class ValidationError(Exception): """ Error raised when validation check fails. """ def __init__(self, expression, message): @@ -32,7 +43,7 @@ def __init__(self, expression, message): self.message = message -class Validator(object): +class Validator(): """ Class containing validation() function and supporting functions. Stores a list of all raised errors and warnings. """ def __init__(self): @@ -41,29 +52,28 @@ def __init__(self): def make_date_filter(self, start_date, end_date): """ Create a function to return a boolean of whether a filename of appropriate format contains a date within the specified date range. 
- + Arguments: - start_date: datetime date object - end_date: datetime date object Returns: - - None + - None """ # Convert dates from datetime format to int. start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) - def f(filename, match): + def f(match): """ Return a boolean of whether a filename of appropriate format contains a date within the specified date range. - + Arguments: - - filename: str - - match: regex match object based on filename_regex + - match: regex match object based on filename_regex applied to a filename str Returns: - - boolean + - boolean """ # If regex match doesn't exist, current filename is not an appropriately formatted source data file. if not match: @@ -72,15 +82,15 @@ def f(filename, match): # Convert date found in CSV name to int. code = int(match.groupdict()['date']) - # Return boolean True if current file is within the defined date range. - return code > start_code and code < end_code + # Return boolean True if current file date "code" is within the defined date range. + return start_code < code < end_code return f def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generation_date): """ Perform some automated format & sanity checks of inputs. 
- + Arguments: - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" @@ -88,15 +98,15 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test Returns: - - None + - None """ - - if (not isinstance(max_check_lookbehind, timedelta)): + + if not isinstance(max_check_lookbehind, timedelta): self.raised.append(ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta")) if( not isinstance(generation_date, date) or generation_date > date.today()): - self.raised.append(ValidationError(generation_date, f"generation.date ({generation.date}) must be a datetime.date type and not in the future.")) - + self.raised.append(ValidationError(generation_date, f"generation_date must be a datetime.date type and not in the future.")) + pattern_found = filename_regex.match(nameformat) if (not nameformat or not pattern_found): self.raised.append(ValidationError(nameformat, 'nameformat ({nameformat}) not recognized')) @@ -110,38 +120,38 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio def check_bad_geo_id(self, df_to_test, geo_type): """ Check validity of geo type and values, according to regex pattern. - + Arguments: - df_to_test: pandas dataframe of CSV source data - geo_type: string from CSV name specifying geo type (state, county, msa, hrr) of data Returns: - - None + - None """ if geo_type not in negated_regex_dict: self.raised.append(ValidationError(geo_type,"Unrecognized geo type")) - + def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. 
""" unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] - if(len(unexpected_geos) > 0): + if len(unexpected_geos) > 0: self.raised.append(ValidationError(unexpected_geos,"Non-conforming geo_ids exist!")) - + find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - def check_missing_dates(self, daily_filenames, sdate, edate): + def check_missing_dates(self, daily_filenames, sdate, edate): """ Check for missing dates between the specified start and end dates. - + Arguments: - daily_filenames: list of CSV source data filenames. - sdate: start date, in datetime format - edate: end date, in datetime format Returns: - - None + - None """ number_of_dates = edate - sdate + timedelta(days=1) @@ -156,44 +166,44 @@ def check_missing_dates(self, daily_filenames, sdate, edate): # Diff expected and observed dates. check_dateholes = list(date_seq.difference(unique_dates)) check_dateholes.sort() - + if check_dateholes: self.raised.append(ValidationError(check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) def check_bad_val(self, df_to_test, signal_type): """ Check value field for validity. 
- + Arguments: - df_to_test: pandas dataframe of a single CSV of source data - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data Returns: - - None + - None """ # Determine if signal is a proportion or percent - proportion_option = True if 'prop' in signal_type or 'pct' in signal_type else False + proportion_option = bool('prop' in signal_type or 'pct' in signal_type) if proportion_option: - if (not df_to_test[(df_to_test['val'] > 100)].empty): + if not df_to_test[(df_to_test['val'] > 100)].empty: self.raised.append(ValidationError(signal_type, "val column can't have any cell greater than 100")) - if (df_to_test['val'].isnull().values.any()): + if df_to_test['val'].isnull().values.any(): self.raised.append(ValidationError(None,"val column can't have any cell that is NA")) - - if (not df_to_test[(df_to_test['val'] < 0)].empty): + + if not df_to_test[(df_to_test['val'] < 0)].empty: self.raised.append(ValidationError(None,"val column can't have any cell smaller than 0")) def check_bad_se(self, df_to_test, missing_se_allowed): """ Check standard errors for validity. - + Arguments: - df_to_test: pandas dataframe of CSV source data - missing_se_allowed: boolean indicating if missing standard errors should raise an exception or not Returns: - - None + - None """ # Add a new se_upper_limit column. df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) @@ -202,47 +212,47 @@ def check_bad_se(self, df_to_test, missing_se_allowed): df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) if not missing_se_allowed: - if (df_to_test['se'].isnull().values.any()): + if df_to_test['se'].isnull().values.any(): self.raised.append(ValidationError(None, "se must not be NA")) - + # Find rows not in the allowed range for se. 
result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - self.raised.append(ValidationError(None, "se must be in (0, min(50,val*(1+eps))]")) + self.raised.append(ValidationError(None, "se must be in (0, min(50,val*(1+eps))]")) elif missing_se_allowed: result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: self.raised.append(ValidationError(None, "se must be NA or in (0, min(50,val*(1+eps))]")) - + result = df_to_test.query('(val == 0) & (se == 0)') if not result.empty: - self.raised.append(ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) + self.raised.append(ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_size_allowed): """ Check sample sizes for validity. 
- + Arguments: - df_to_test: pandas dataframe of a single CSV of source data - minimum_sample_size: int - missing_sample_size_allowed: boolean indicating if missing sample size should raise an exception or not Returns: - - None + - None """ if not missing_sample_size_allowed: - if (df_to_test['sample_size'].isnull().values.any()): + if df_to_test['sample_size'].isnull().values.any(): self.raised.append(ValidationError(None, "sample_size must not be NA")) - + # Find rows with sample size less than minimum allowed result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: - self.raised.append(ValidationError(None, "sample size must be >= {minimum_sample_size}")) + self.raised.append(ValidationError(None, "sample size must be >= {minimum_sample_size}")) elif missing_sample_size_allowed: result = df_to_test.query('~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') @@ -253,14 +263,14 @@ def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option='unweighted'): """ Check if time since data was generated is reasonable or too long ago. - + Arguments: - max_date: date of most recent data to be validated; datetime format. - generation_date: date data to test was generated; datetime format. 
- weighted_option: str; selects the "reasonable" threshold Returns: - - None + - None """ switcher = { 'unweighted': timedelta(days=1), @@ -269,51 +279,45 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option= # Get the setting from switcher dictionary thres = switcher.get(weighted_option, lambda: "Invalid weighting option") - if (max_date < generation_date - thres): + if max_date < generation_date - thres: self.raised.append(ValidationError(None, "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, generation_date): """ Check if time since data was generated is reasonable or too recent. - + Arguments: - max_date: date of most recent data to be validated; datetime format. - generation_date: date data to test was generated; datetime format. Returns: - - None + - None """ - if (max_date < generation_date - timedelta(days=1)): + if max_date < generation_date - timedelta(days=1): self.raised.append(ValidationError(None, "most recent date of generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference): """ Check if reference data is more recent than test data. 
- + Arguments: - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data Returns: - - None + - None """ if df_to_test["date"].max() < df_to_reference["date"].max(): - self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), - 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + - 'to handle this case, and this situation may indicate that something locally is out of date,' + - 'or, if the local working files have already been compared against the reference,' + - 'that there is a bug somewhere')) - - def reldiff_by_min(self, x, y): - """ - Calculate relative difference between two numbers. - """ - return (x - y) / min(x,y) + self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), + 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + + 'to handle this case, and this situation may indicate that something locally is out of date,' + + 'or, if the local working files have already been compared against the reference,' + + 'that there is a bug somewhere')) def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_list, sig, geo): """ Compare number of obervations per day in test dataframe vs reference dataframe. 
- + Arguments: - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data @@ -323,25 +327,25 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_li - geo: str; geo type name (county, msa, hrr, state) as in the CSV name Returns: - - None + - None """ test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] reference_rows_per_reporting_day = df_to_reference.shape[0] / len(date_list) - - if(abs(self.reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35): + + if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: self.raised.append(ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs test data)")) def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): """ Compare average values for each variable in test dataframe vs reference dataframe. - + Arguments: - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") Returns: - - None + - None """ # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() @@ -396,10 +400,10 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): + 'difference, relative to average values of corresponding variables. 
For the former' \ + 'check, tolerances for `val` are more restrictive than those for other columns.')) - def validate(self, export_dir, start_date, end_date, data_source, params={}, generation_date = date.today()): + def validate(self, export_dir, start_date, end_date, data_source, params, generation_date = date.today()): """ Runs all data checks. - + Arguments: - export_dir: path to data CSVs - start_date: beginning date of data to check @@ -409,10 +413,10 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test Returns: - - None + - None """ # Get user settings from params or if not provided, set default. - max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) + max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) minimum_sample_size = params.get('minimum_sample_size', 100) missing_se_allowed = params.get('missing_se_allowed', False) missing_sample_size_allowed = params.get('missing_sample_size_allowed', False) @@ -425,10 +429,10 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen export_files = read_filenames(export_dir) date_filter = self.make_date_filter(start_date, end_date) # List of tuples of CSV names and regex match objects. - validate_files = [(f, m) for (f, m) in export_files if date_filter(f,m)] + validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] all_frames = [] - + # TODO: Make weight_option based on signal name for Facebook data. 
See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 self.check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') self.check_max_allowed_max_date(end_date, generation_date) @@ -451,12 +455,12 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen df['signal'] = match.groupdict()['signal'] # Add current CSV data to all_frames. - all_frames.append(df) + all_frames.append(df) # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) - - # Get all expected combinations of geo_type and signal. + + # Get all expected combinations of geo_type and signal. geo_sig_cmbo = get_geo_sig_cmbo(data_source) # Get list of dates we expect to see in all the CSV data. @@ -464,7 +468,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) # Get list of CSV names. - filenames = [name_match_pair[0] for name_match_pair in validate_files] + filenames = [name_match_pair[0] for name_match_pair in validate_files] ## recent_lookbehind: start from the check date and working backward in time, ## how many days do we include in the window of date to check for anomalies? @@ -479,7 +483,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen # ## New from reference code. # # TODO: Check recent data against both semirecent and API data. 
- # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, + # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, # max(all_frames["date"]) - max_check_lookbehind + 1) # end_checking_date = max(all_frames["date"]) @@ -500,7 +504,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen # if (recent_df_to_test["se"].isnull().mean() > 0.5): # self.raised.append('Recent se values are >50% NA') - # self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) + # self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) # if sanity_check_rows_per_day: # self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) @@ -530,13 +534,13 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): - + m = smooth_option_regex.match(sig) smooth_option = m.group(1) if smooth_option not in ('raw', 'smoothed'): smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' - + #recent_df.set_index("time_value", inplace = True) print("Printing recent_df scenes:", recent_df.shape) print(recent_df) @@ -547,17 +551,17 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) # Replace None with NA to make numerical manipulation easier. - recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) + recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) # Rename columns to match those in df_to_test. 
recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) - + # Reorder columns. column_names = ["geo_id", "val", "se", "sample_size", "time_value"] recent_api_df = recent_api_df.reindex(columns=column_names) - if (recent_df["se"].isnull().mean() > 0.5): + if recent_df["se"].isnull().mean() > 0.5: self.raised.append('Recent se values are >50% NA') if sanity_check_rows_per_day: @@ -567,13 +571,16 @@ def validate(self, export_dir, start_date, end_date, data_source, params={}, gen self.check_avg_val_diffs(recent_df, recent_api_df, smooth_option) kroc += 1 - if kroc == 2: + if kroc == 2: break self.exit() - + def exit(self): + """ + If any exceptions were raised, print and exit with non-zero status. + """ if self.raised: print(len(self.raised), "messages") From 517b6d9ad3df39c605118fa352b737f382dcdd53 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 29 Sep 2020 16:58:00 -0400 Subject: [PATCH 055/151] Formatting improvements --- validator/delphi_validator/datafetcher.py | 25 ++- validator/delphi_validator/run.py | 9 +- validator/delphi_validator/validate.py | 221 +++++++++++++--------- 3 files changed, 155 insertions(+), 100 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 1f265833c..6c038c355 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -14,7 +14,8 @@ import covidcast from .errors import APIDataFetchError -filename_regex = re.compile(r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') +filename_regex = re.compile( + r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): @@ -46,7 +47,7 @@ def get_geo_sig_cmbo(data_source): combinations reported available by Covidcast metadata. 
""" meta = covidcast.metadata() - source_meta = meta[meta['data_source']==data_source] + source_meta = meta[meta['data_source'] == data_source] unique_signals = source_meta['signal'].unique().tolist() unique_geotypes = source_meta['geo_type'].unique().tolist() @@ -74,9 +75,11 @@ def read_filenames(path): Returns: - list of tuples """ - daily_filenames = [ (f, filename_regex.match(f)) for f in listdir(path) if isfile(join(path, f))] + daily_filenames = [(f, filename_regex.match(f)) + for f in listdir(path) if isfile(join(path, f))] return daily_filenames + def read_relevant_date_filenames(data_path, date_slist): """ Return a list of tuples of every filename in the specified directory if the file is in the specified date range. @@ -97,6 +100,7 @@ def read_relevant_date_filenames(data_path, date_slist): filenames.append(fl) return filenames + def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): """ Generator that assembles data within the specified date range for a given geo_sig_cmbo. @@ -116,10 +120,12 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): df_list = list() # Get all filenames for this geo_type and signal_type - files = [file for file in filenames if geo_sig[0] in file and geo_sig[1] in file] + files = [file for file in filenames if geo_sig[0] + in file and geo_sig[1] in file] if len(files) == 0: - print("FILE_NOT_FOUND: File with geo_type:", geo_sig[0], " and signal:", geo_sig[1], " does not exist!") + print("FILE_NOT_FOUND: File with geo_type:", + geo_sig[0], " and signal:", geo_sig[1], " does not exist!") yield pd.DataFrame(), geo_sig[0], geo_sig[1] continue @@ -136,6 +142,7 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): yield pd.concat(df_list), geo_sig[0], geo_sig[1] + def load_csv(path): """ Load CSV with specified column types. 
@@ -149,15 +156,17 @@ def load_csv(path): 'sample_size': float, }) + def fetch_daily_data(data_source, survey_date, geo_type, signal): """ Get API data for a specified date, source, signal, and geo type. """ - data_to_reference = covidcast.signal(data_source, signal, survey_date, survey_date, geo_type) + data_to_reference = covidcast.signal( + data_source, signal, survey_date, survey_date, geo_type) if not isinstance(data_to_reference, pd.DataFrame): - custom_msg = "Error fetching data on" + str(survey_date)+ \ + custom_msg = "Error fetching data on" + str(survey_date) + \ "for data source:" + data_source + \ - ", signal-type:"+ signal + \ + ", signal-type:" + signal + \ ", geography-type:" + geo_type raise APIDataFetchError(custom_msg) return data_to_reference diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 12883322b..c7cca45ae 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -14,8 +14,11 @@ def run_module(): params = parent_params['validation'] data_source = params['data_source'] - dtobj_sdate = datetime.date(datetime.strptime(params['start_date'], '%Y-%m-%d')) - dtobj_edate = datetime.date(datetime.strptime(params['end_date'], '%Y-%m-%d')) + dtobj_sdate = datetime.date( + datetime.strptime(params['start_date'], '%Y-%m-%d')) + dtobj_edate = datetime.date( + datetime.strptime(params['end_date'], '%Y-%m-%d')) validator = Validator() - validator.validate(parent_params["export_dir"], dtobj_sdate, dtobj_edate, data_source, params) + validator.validate(parent_params["export_dir"], + dtobj_sdate, dtobj_edate, data_source, params) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index eca355b61..20f496a02 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -29,10 +29,12 @@ def reldiff_by_min(x, y): """ Calculate relative difference between two numbers. 
""" - return (x - y) / min(x,y) + return (x - y) / min(x, y) + class ValidationError(Exception): """ Error raised when validation check fails. """ + def __init__(self, expression, message): """ Arguments: @@ -64,7 +66,6 @@ def make_date_filter(self, start_date, end_date): start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) - def f(match): """ Return a boolean of whether a filename of appropriate format contains a date within the specified date range. @@ -104,19 +105,20 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio if not isinstance(max_check_lookbehind, timedelta): self.raised.append(ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta")) - if( not isinstance(generation_date, date) or generation_date > date.today()): + if not isinstance(generation_date, date) or generation_date > date.today(): self.raised.append(ValidationError(generation_date, f"generation_date must be a datetime.date type and not in the future.")) pattern_found = filename_regex.match(nameformat) - if (not nameformat or not pattern_found): - self.raised.append(ValidationError(nameformat, 'nameformat ({nameformat}) not recognized')) + if not nameformat or not pattern_found: + self.raised.append(ValidationError( + nameformat, 'nameformat ({nameformat}) not recognized')) if not isinstance(df_to_test, pd.DataFrame): - self.raised.append(ValidationError(nameformat, 'df_to_test must be a pandas dataframe.')) + self.raised.append(ValidationError( + nameformat, 'df_to_test must be a pandas dataframe.')) # TODO: check column names and types in df_to_test. Currently skipped since load_csv() specifies field names and types on read. Extra columns will simply be ignored during later processing. - def check_bad_geo_id(self, df_to_test, geo_type): """ Check validity of geo type and values, according to regex pattern. 
@@ -129,15 +131,18 @@ def check_bad_geo_id(self, df_to_test, geo_type): - None """ if geo_type not in negated_regex_dict: - self.raised.append(ValidationError(geo_type,"Unrecognized geo type")) + self.raised.append(ValidationError( + geo_type, "Unrecognized geo type")) def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. """ - unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall(negated_regex) if len(ugeo) > 0] + unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall( + negated_regex) if len(ugeo) > 0] if len(unexpected_geos) > 0: - self.raised.append(ValidationError(unexpected_geos,"Non-conforming geo_ids exist!")) + self.raised.append(ValidationError( + unexpected_geos, "Non-conforming geo_ids exist!")) find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) @@ -156,19 +161,22 @@ def check_missing_dates(self, daily_filenames, sdate, edate): number_of_dates = edate - sdate + timedelta(days=1) # Create set of all expected dates. - date_seq = {sdate + timedelta(days=x) for x in range(number_of_dates.days)} + date_seq = {sdate + timedelta(days=x) + for x in range(number_of_dates.days)} unique_dates = set() # Add each date seen in CSV names to set. for daily_filename in daily_filenames: - unique_dates.add(datetime.strptime(daily_filename[0][0:8], '%Y%m%d')) + unique_dates.add(datetime.strptime( + daily_filename[0][0:8], '%Y%m%d')) # Diff expected and observed dates. 
check_dateholes = list(date_seq.difference(unique_dates)) check_dateholes.sort() if check_dateholes: - self.raised.append(ValidationError(check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) + self.raised.append(ValidationError( + check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) def check_bad_val(self, df_to_test, signal_type): """ @@ -186,13 +194,16 @@ def check_bad_val(self, df_to_test, signal_type): if proportion_option: if not df_to_test[(df_to_test['val'] > 100)].empty: - self.raised.append(ValidationError(signal_type, "val column can't have any cell greater than 100")) + self.raised.append(ValidationError( + signal_type, "val column can't have any cell greater than 100")) if df_to_test['val'].isnull().values.any(): - self.raised.append(ValidationError(None,"val column can't have any cell that is NA")) + self.raised.append(ValidationError( + None, "val column can't have any cell that is NA")) if not df_to_test[(df_to_test['val'] < 0)].empty: - self.raised.append(ValidationError(None,"val column can't have any cell smaller than 0")) + self.raised.append(ValidationError( + None, "val column can't have any cell smaller than 0")) def check_bad_se(self, df_to_test, missing_se_allowed): """ @@ -206,9 +217,10 @@ def check_bad_se(self, df_to_test, missing_se_allowed): - None """ # Add a new se_upper_limit column. 
- df_to_test.eval('se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) + df_to_test.eval( + 'se_upper_limit = (val * sample_size + 50)/(sample_size + 1)', inplace=True) - df_to_test['se']= df_to_test['se'].round(3) + df_to_test['se'] = df_to_test['se'].round(3) df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) if not missing_se_allowed: @@ -216,21 +228,26 @@ def check_bad_se(self, df_to_test, missing_se_allowed): self.raised.append(ValidationError(None, "se must not be NA")) # Find rows not in the allowed range for se. - result = df_to_test.query('~((se > 0) & (se < 50) & (se <= se_upper_limit))') + result = df_to_test.query( + '~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - self.raised.append(ValidationError(None, "se must be in (0, min(50,val*(1+eps))]")) + self.raised.append(ValidationError( + None, "se must be in (0, min(50,val*(1+eps))]")) elif missing_se_allowed: - result = df_to_test.query('~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') + result = df_to_test.query( + '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: - self.raised.append(ValidationError(None, "se must be NA or in (0, min(50,val*(1+eps))]")) + self.raised.append(ValidationError( + None, "se must be NA or in (0, min(50,val*(1+eps))]")) result = df_to_test.query('(val == 0) & (se == 0)') if not result.empty: - self.raised.append(ValidationError(None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) + self.raised.append(ValidationError( + None, "when signal value is 0, se must be non-zero. 
please use Jeffreys correction to generate an appropriate se")) def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_size_allowed): """ @@ -246,19 +263,23 @@ def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_ """ if not missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): - self.raised.append(ValidationError(None, "sample_size must not be NA")) + self.raised.append(ValidationError( + None, "sample_size must not be NA")) # Find rows with sample size less than minimum allowed result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: - self.raised.append(ValidationError(None, "sample size must be >= {minimum_sample_size}")) + self.raised.append(ValidationError( + None, "sample size must be >= {minimum_sample_size}")) elif missing_sample_size_allowed: - result = df_to_test.query('~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') + result = df_to_test.query( + '~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') if not result.empty: - self.raised.append(ValidationError(None, "sample size must be NA or >= {minimum_sample_size}")) + self.raised.append(ValidationError( + None, "sample size must be NA or >= {minimum_sample_size}")) def check_min_allowed_max_date(self, max_date, generation_date, weighted_option='unweighted'): """ @@ -277,10 +298,12 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option= 'weighted': timedelta(days=4) } # Get the setting from switcher dictionary - thres = switcher.get(weighted_option, lambda: "Invalid weighting option") + thres = switcher.get( + weighted_option, lambda: "Invalid weighting option") if max_date < generation_date - thres: - self.raised.append(ValidationError(None, "most recent date of generated file seems too long ago")) + self.raised.append(ValidationError( + None, "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, 
max_date, generation_date): """ @@ -294,7 +317,8 @@ def check_max_allowed_max_date(self, max_date, generation_date): - None """ if max_date < generation_date - timedelta(days=1): - self.raised.append(ValidationError(None, "most recent date of generated file seems too recent")) + self.raised.append(ValidationError( + None, "most recent date of generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference): """ @@ -309,10 +333,10 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference): """ if df_to_test["date"].max() < df_to_reference["date"].max(): self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), - 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + - 'to handle this case, and this situation may indicate that something locally is out of date,' + - 'or, if the local working files have already been compared against the reference,' + - 'that there is a bug somewhere')) + 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + + 'to handle this case, and this situation may indicate that something locally is out of date,' + + 'or, if the local working files have already been compared against the reference,' + + 'that there is a bug somewhere')) def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_list, sig, geo): """ @@ -329,11 +353,14 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_li Returns: - None """ - test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] - reference_rows_per_reporting_day = df_to_reference.shape[0] / len(date_list) + test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] + == checking_date].shape[0] + reference_rows_per_reporting_day = df_to_reference.shape[0] / len( + date_list) if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 
0.35: - self.raised.append(ValidationError((checking_date,sig,geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs test data)")) + self.raised.append(ValidationError((checking_date, sig, geo), + "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs test data)")) def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): """ @@ -348,59 +375,66 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): - None """ # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. - df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[ + ['val', 'se', 'sample_size']].mean() df_to_test["type"] = "test" - df_to_reference = df_to_reference.groupby(['geo_id'], as_index=False)[['val', 'se', 'sample_size']].mean() + df_to_reference = df_to_reference.groupby(['geo_id'], as_index=False)[ + ['val', 'se', 'sample_size']].mean() df_to_reference["type"] = "reference" df_all = pd.concat([df_to_test, df_to_reference]) # For each variable (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. 
- df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val","se","sample_size"] - ).pivot(index=("geo_id", "variable"), columns="type", values="value" - ).reset_index(("geo_id","variable") - ).dropna( - ).assign( - type_diff=lambda x: x["test"] - x["reference"], - abs_type_diff=lambda x: abs(x["type_diff"]) - ).groupby("variable", as_index=False - ).agg( - mean_type_diff=("type_diff", "mean"), - mean_abs_type_diff=("abs_type_diff", "mean"), - mean_test_var=("test", "mean"), - mean_ref_var=("reference", "mean") - ).assign( - mean_stddiff=lambda x: 2 * x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), - mean_stdabsdiff=lambda x: 2 * x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) - )[["variable", "mean_stddiff", "mean_stdabsdiff"]] + df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] + ).pivot(index=("geo_id", "variable"), columns="type", values="value" + ).reset_index(("geo_id", "variable") + ).dropna( + ).assign( + type_diff=lambda x: x["test"] - x["reference"], + abs_type_diff=lambda x: abs(x["type_diff"]) + ).groupby("variable", as_index=False + ).agg( + mean_type_diff=("type_diff", "mean"), + mean_abs_type_diff=("abs_type_diff", "mean"), + mean_test_var=("test", "mean"), + mean_ref_var=("reference", "mean") + ).assign( + mean_stddiff=lambda x: 2 * + x["mean_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]), + mean_stdabsdiff=lambda x: 2 * + x["mean_abs_type_diff"] / (x["mean_test_var"] + x["mean_ref_var"]) + )[["variable", "mean_stddiff", "mean_stdabsdiff"]] # Set thresholds for raw and smoothed variables. 
classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T - smoothed_thresholds = raw_thresholds.apply(lambda x: x/(math.sqrt(7) * 1.5)) + smoothed_thresholds = raw_thresholds.apply( + lambda x: x/(math.sqrt(7) * 1.5)) switcher = { - 'raw': raw_thresholds, - 'smoothed': smoothed_thresholds, + 'raw': raw_thresholds, + 'smoothed': smoothed_thresholds, } # Get the selected thresholds from switcher dictionary thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. - mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ((df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) - mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() + mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ( + (df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) + mean_stdabsdiff_high = ( + df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() flag = mean_stddiff_high or mean_stdabsdiff_high if flag: - self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' \ - + 'large --- either large increase tending toward one direction or large mean absolute' \ - + 'difference, relative to average values of corresponding variables. 
For the former' \ - + 'check, tolerances for `val` are more restrictive than those for other columns.')) + self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' + + 'large --- either large increase tending toward one direction or large mean absolute' + + 'difference, relative to average values of corresponding variables. For the former' + + 'check, tolerances for `val` are more restrictive than those for other columns.')) - def validate(self, export_dir, start_date, end_date, data_source, params, generation_date = date.today()): + def validate(self, export_dir, start_date, end_date, data_source, params, generation_date=date.today()): """ Runs all data checks. @@ -419,13 +453,14 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) minimum_sample_size = params.get('minimum_sample_size', 100) missing_se_allowed = params.get('missing_se_allowed', False) - missing_sample_size_allowed = params.get('missing_sample_size_allowed', False) + missing_sample_size_allowed = params.get( + 'missing_sample_size_allowed', False) - sanity_check_rows_per_day = params.get('sanity_check_rows_per_day', True) + sanity_check_rows_per_day = params.get( + 'sanity_check_rows_per_day', True) sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) check_vs_working = params.get('check_vs_working', True) - export_files = read_filenames(export_dir) date_filter = self.make_date_filter(start_date, end_date) # List of tuples of CSV names and regex match objects. @@ -434,7 +469,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera all_frames = [] # TODO: Make weight_option based on signal name for Facebook data. 
See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 - self.check_min_allowed_max_date(end_date, generation_date, weighted_option='unweighted') + self.check_min_allowed_max_date( + end_date, generation_date, weighted_option='unweighted') self.check_max_allowed_max_date(end_date, generation_date) self.check_missing_dates(validate_files, start_date, end_date) @@ -443,11 +479,13 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera for filename, match in validate_files: df = load_csv(join(export_dir, filename)) - self.validate_daily(df, filename, max_check_lookbehind, generation_date) + self.validate_daily( + df, filename, max_check_lookbehind, generation_date) self.check_bad_geo_id(df, match.groupdict()['geo_type']) self.check_bad_val(df, match.groupdict()['signal']) self.check_bad_se(df, missing_se_allowed) - self.check_bad_sample_size(df, minimum_sample_size, missing_sample_size_allowed) + self.check_bad_sample_size( + df, minimum_sample_size, missing_sample_size_allowed) # Get geo_type, date, and signal name as specified by CSV name. df['geo_type'] = match.groupdict()['geo_type'] @@ -465,22 +503,22 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # Get list of dates we expect to see in all the CSV data. date_slist = all_frames['date'].unique().tolist() - date_list = list(map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) + date_list = list( + map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) # Get list of CSV names. filenames = [name_match_pair[0] for name_match_pair in validate_files] - ## recent_lookbehind: start from the check date and working backward in time, - ## how many days do we include in the window of date to check for anomalies? - ## Choosing 1 day checks just the check data itself. 
+ # recent_lookbehind: start from the check date and working backward in time, + # how many days do we include in the window of date to check for anomalies? + # Choosing 1 day checks just the check data itself. recent_lookbehind = timedelta(days=1) - ## semirecent_lookbehind: starting from the check date and working backward - ## in time, how many days -- before subtracting out the "recent" days --- - ## do we use to form the reference statistics? + # semirecent_lookbehind: starting from the check date and working backward + # in time, how many days -- before subtracting out the "recent" days --- + # do we use to form the reference statistics? semirecent_lookbehind = timedelta(days=7) - # ## New from reference code. # # TODO: Check recent data against both semirecent and API data. # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, @@ -527,7 +565,6 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # # create new columns checking whether a given field (val, se, sample_size) is missing in both test and reference data, or if it is within a set tolerance # # filter data to keep only rows where one of the new columns is true. if df is not empty, raise error that "mismatches between less recent data in data frame to test and corresponding reference data frame" - smooth_option_regex = re.compile(r'([^_]+)') kroc = 0 @@ -548,27 +585,34 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # -recent- dataframe run backwards from the checking_date recent_end_date = checking_date - recent_lookbehind recent_begin_date = checking_date - max_check_lookbehind - recent_api_df = covidcast.signal(data_source, sig, recent_begin_date, recent_end_date, geo) + recent_api_df = covidcast.signal( + data_source, sig, recent_begin_date, recent_end_date, geo) # Replace None with NA to make numerical manipulation easier. 
- recent_api_df.replace(to_replace=[None], value=np.nan, inplace=True) + recent_api_df.replace( + to_replace=[None], value=np.nan, inplace=True) # Rename columns to match those in df_to_test. - recent_api_df.rename(columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace = True) - recent_api_df.drop(['direction', 'issue', 'lag'], axis=1, inplace = True) + recent_api_df.rename( + columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace=True) + recent_api_df.drop( + ['direction', 'issue', 'lag'], axis=1, inplace=True) # Reorder columns. - column_names = ["geo_id", "val", "se", "sample_size", "time_value"] + column_names = ["geo_id", "val", + "se", "sample_size", "time_value"] recent_api_df = recent_api_df.reindex(columns=column_names) if recent_df["se"].isnull().mean() > 0.5: self.raised.append('Recent se values are >50% NA') if sanity_check_rows_per_day: - self.check_rapid_change(recent_df, recent_api_df, checking_date, date_list, sig, geo) + self.check_rapid_change( + recent_df, recent_api_df, checking_date, date_list, sig, geo) if sanity_check_value_diffs: - self.check_avg_val_diffs(recent_df, recent_api_df, smooth_option) + self.check_avg_val_diffs( + recent_df, recent_api_df, smooth_option) kroc += 1 if kroc == 2: @@ -576,7 +620,6 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera self.exit() - def exit(self): """ If any exceptions were raised, print and exit with non-zero status. 
From c5cce713e56ec64b81628e9db8026593bbe6008d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 29 Sep 2020 18:46:19 -0400 Subject: [PATCH 056/151] Updated README and template params --- validator/README.md | 44 ++++++++++++++++++++++------------ validator/params.json.template | 12 ++++++---- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/validator/README.md b/validator/README.md index af0af1071..f00ed322f 100644 --- a/validator/README.md +++ b/validator/README.md @@ -1,36 +1,43 @@ # Validator The validator performs two main tasks: -1) Sanity checks on daily data generated from a pipeline of specific data +1) Sanity checks on daily data generated from the pipeline of a specific data source. -2) Its does a comparative analysis with recent data from the API - to detect any anomalies such as spikes, significant value differences +2) Comparative analysis with recent data from the API + to detect any anomalies, such as spikes or significant value differences -The validator validates against daily data thats already written in the disk +The validator validates new source data against daily data that is already written to disk, making the execution of the validator independent of the pipeline execution. -This creates an additional advantage of running the validation against multiple -days of daily data and have a better cummulative analysis. +This creates the additional advantage of validating against multiple +days of daily data for a better cummulative analysis. -## Running the Indicator +## Running the Validator -The indicator is run by directly executing the Python module contained in this -directory. The safest way to do this is to create a virtual environment, -installed the common DELPHI tools, and then install the module and its -dependencies. 
To do this, run the following code from this directory: +The validator is run by executing the Python module contained in this +directory from the main directory of the indicator of interest. + +The safest way to do this is to create a virtual environment, +install the common DELPHI tools, install the indicator module and its +dependencies, and then install the validator module and its +dependencies to the virtual environment. To do this, navigate to the main directory of the indicator of interest and run the following code: ``` python -m venv env source env/bin/activate pip install ../_delphi_utils_python/. pip install . +pip install -e ../validator ``` -All of the user-changable parameters are stored in `params.json`. To execute -the module and produce the output datasets (by default, in `receiving`), run -the following: +All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `start_date`, and `end_date`. The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. + +To execute +the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run +the validator, as follows: ``` +env/bin/python -m delphi_INDICATORNAME env/bin/python -m delphi_validator ``` @@ -52,4 +59,11 @@ env/bin/pylint delphi_validator ``` The most aggressive checks are turned off; only relatively important issues -should be raised and they should be manually checked (or better, fixed). \ No newline at end of file +should be raised and they should be manually checked (or better, fixed). 
+ +## Code tour + +* run.py: sends params.json fields to and runs the validation process +* datafetcher.py: methods for loading source data +* validate.py: methods for validating source data. Includes the individual check functions. +* errors.py: custom validation errors \ No newline at end of file diff --git a/validator/params.json.template b/validator/params.json.template index f933086d2..128f4d10e 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,9 +1,11 @@ { "validation": { - "data_source": "fb-survey", - "start_date": "2020-06-13", - "end_date": "2020-06-19", - "ref_window_size": 7, - "minimum_sample_size": 100 + "data_source": "usa-facts", + "start_date": "2020-09-05", + "end_date": "2020-09-08", + "ref_window_size": 7, + "minimum_sample_size": 100, + "missing_se_allowed": true, + "missing_sample_size_allowed": true } } \ No newline at end of file From 37d93af693422207351b7ec1f5d411dccbf1641c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 30 Sep 2020 09:38:26 -0400 Subject: [PATCH 057/151] Updated README with new check info --- validator/README.md | 17 +++++++++++++---- validator/REVIEW.md | 2 -- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/validator/README.md b/validator/README.md index f00ed322f..7402d9362 100644 --- a/validator/README.md +++ b/validator/README.md @@ -20,7 +20,9 @@ directory from the main directory of the indicator of interest. The safest way to do this is to create a virtual environment, install the common DELPHI tools, install the indicator module and its dependencies, and then install the validator module and its -dependencies to the virtual environment. To do this, navigate to the main directory of the indicator of interest and run the following code: +dependencies to the virtual environment. 
+ +To do this, navigate to the main directory of the indicator of interest and run the following code: ``` python -m venv env @@ -32,8 +34,7 @@ pip install -e ../validator All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `start_date`, and `end_date`. The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. -To execute -the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run +To execute the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run the validator, as follows: ``` @@ -61,9 +62,17 @@ env/bin/pylint delphi_validator The most aggressive checks are turned off; only relatively important issues should be raised and they should be manually checked (or better, fixed). + ## Code tour * run.py: sends params.json fields to and runs the validation process * datafetcher.py: methods for loading source data * validate.py: methods for validating source data. Includes the individual check functions. -* errors.py: custom validation errors \ No newline at end of file +* errors.py: custom validation errors + + +## Adding checks + +To add a new validation check, define the check as a `Validator` class method in `validate.py`. Each check should append a descriptive error message to the `raised` attribute if triggered. All checks should allow the user to override exception raising for a specific file using the `exception_override` setting in `params.json`. + +Add the newly defined check to the `validate()` method to be executed. 
It should go in one of two sections: data sanity checks where a data file is compared against static format settings, or data trend and value checks where a set of data is compared against API data. \ No newline at end of file diff --git a/validator/REVIEW.md b/validator/REVIEW.md index 93a5a6579..d7dd2ce77 100644 --- a/validator/REVIEW.md +++ b/validator/REVIEW.md @@ -20,8 +20,6 @@ flexible, but be consistent within a module within the directory `static` - [ ] any intermediate files that are created and stored by the module should be placed in the directory `cache` -- [ ] final expected output files to be uploaded to the API are placed in the -`receiving` directory; output files should not be committed to the respository - [ ] all options and API keys are passed through the file `params.json` - [ ] template parameter file (`params.json.template`) is checked into the code; no personal (i.e., usernames) or private (i.e., API keys) information is From 32fb967def335143affa59f3f66b0acc97c1bfeb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 30 Sep 2020 10:02:57 -0400 Subject: [PATCH 058/151] Added unittesting info in README and directory --- validator/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/validator/README.md b/validator/README.md index 7402d9362..9d827d836 100644 --- a/validator/README.md +++ b/validator/README.md @@ -52,6 +52,15 @@ rm -r env ## Testing the code +To test the code, please create a new virtual environment in the main module directory using the following procedure, similar to above: + +``` +python -m venv env +source env/bin/activate +pip install ../_delphi_utils_python/. +pip install . +``` + To do a static test of the code style, it is recommended to run **pylint** on the module. 
To do this, run the following from the main module directory: @@ -62,6 +71,14 @@ env/bin/pylint delphi_validator The most aggressive checks are turned off; only relatively important issues should be raised and they should be manually checked (or better, fixed). +Unit tests are also included in the module. To execute these, run the following command from this directory: + +``` +(cd tests && ../env/bin/pytest --cov=delphi_validator --cov-report=term-missing) +``` + +The output will show the number of unit tests that passed and failed, along with the percentage of code covered by the tests. None of the tests should fail and the code lines that are not covered by unit tests should be small and should not include critical sub-routines. + ## Code tour From 2834df797796530c64590b7ca9b9d40ddcdcfd33 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 1 Oct 2020 18:04:23 -0400 Subject: [PATCH 059/151] validate() comparison checks now match reference code --- validator/delphi_validator/validate.py | 164 ++++++++++--------------- validator/tests/test_checks.py | 18 +++ 2 files changed, 86 insertions(+), 96 deletions(-) create mode 100644 validator/tests/test_checks.py diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 20f496a02..ddc0acf37 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -14,6 +14,7 @@ import covidcast from .datafetcher import load_csv, read_geo_sig_cmbo_files, read_filenames, get_geo_sig_cmbo, filename_regex +import pdb # Recognized geo types. 
negated_regex_dict = { @@ -190,20 +191,26 @@ def check_bad_val(self, df_to_test, signal_type): - None """ # Determine if signal is a proportion or percent - proportion_option = bool('prop' in signal_type or 'pct' in signal_type) + percent_option = bool('pct' in signal_type) + proportion_option = bool('pct' in signal_type) - if proportion_option: + if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: self.raised.append(ValidationError( - signal_type, "val column can't have any cell greater than 100")) + signal_type, "val column can't have any cell greater than 100 for percents")) + + if proportion_option: + if not df_to_test[(df_to_test['val'] > 100000)].empty: + self.raised.append(ValidationError( + signal_type, "val column can't have any cell greater than 100000 for proportions")) if df_to_test['val'].isnull().values.any(): self.raised.append(ValidationError( - None, "val column can't have any cell that is NA")) + signal_type, "val column can't have any cell that is NA")) if not df_to_test[(df_to_test['val'] < 0)].empty: self.raised.append(ValidationError( - None, "val column can't have any cell smaller than 0")) + signal_type, "val column can't have any cell smaller than 0")) def check_bad_se(self, df_to_test, missing_se_allowed): """ @@ -316,7 +323,7 @@ def check_max_allowed_max_date(self, max_date, generation_date): Returns: - None """ - if max_date < generation_date - timedelta(days=1): + if max_date > generation_date - timedelta(days=1): self.raised.append(ValidationError( None, "most recent date of generated file seems too recent")) @@ -338,7 +345,7 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference): 'or, if the local working files have already been compared against the reference,' + 'that there is a bug somewhere')) - def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_list, sig, geo): + def check_rapid_change(self, df_to_test, df_to_reference, checking_date, sig, geo): """ Compare number of 
obervations per day in test dataframe vs reference dataframe. @@ -356,7 +363,7 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, date_li test_rows_per_reporting_day = df_to_test[df_to_test['time_value'] == checking_date].shape[0] reference_rows_per_reporting_day = df_to_reference.shape[0] / len( - date_list) + set(df_to_reference["time_value"])) if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: self.raised.append(ValidationError((checking_date, sig, geo), @@ -429,7 +436,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): flag = mean_stddiff_high or mean_stdabsdiff_high if flag: - self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & refernce data (either semirecent or from API) seem' + self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' + 'large --- either large increase tending toward one direction or large mean absolute' + 'difference, relative to average values of corresponding variables. For the former' + 'check, tolerances for `val` are more restrictive than those for other columns.')) @@ -449,6 +456,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera Returns: - None """ + # Setup # Get user settings from params or if not provided, set default. max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) minimum_sample_size = params.get('minimum_sample_size', 100) @@ -461,21 +469,23 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) check_vs_working = params.get('check_vs_working', True) + # Get relevant data file names and info. 
export_files = read_filenames(export_dir) date_filter = self.make_date_filter(start_date, end_date) # List of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] + # Get list of just CSV names. + filenames = [name_match_pair[0] for name_match_pair in validate_files] - all_frames = [] - - # TODO: Make weight_option based on signal name for Facebook data. See reference here: https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L207 - self.check_min_allowed_max_date( - end_date, generation_date, weighted_option='unweighted') - self.check_max_allowed_max_date(end_date, generation_date) + # Get all expected combinations of geo_type and signal. + geo_sig_cmbo = get_geo_sig_cmbo(data_source) self.check_missing_dates(validate_files, start_date, end_date) - # For every file, read in and do some basic format and value checks. + all_frames = [] + + # Individual file checks + # For every daily file, read in and do some basic format and value checks. for filename, match in validate_files: df = load_csv(join(export_dir, filename)) @@ -486,8 +496,10 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera self.check_bad_se(df, missing_se_allowed) self.check_bad_sample_size( df, minimum_sample_size, missing_sample_size_allowed) - # Get geo_type, date, and signal name as specified by CSV name. + # TODO: Check to see, if this date is in the API, values have been updated and changed significantly. + + # Get geo_type, date, and signal name as specified by CSV name. 
df['geo_type'] = match.groupdict()['geo_type'] df['date'] = match.groupdict()['date'] df['signal'] = match.groupdict()['signal'] @@ -498,79 +510,29 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) - # Get all expected combinations of geo_type and signal. - geo_sig_cmbo = get_geo_sig_cmbo(data_source) - # Get list of dates we expect to see in all the CSV data. date_slist = all_frames['date'].unique().tolist() date_list = list( map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) - # Get list of CSV names. - filenames = [name_match_pair[0] for name_match_pair in validate_files] - # recent_lookbehind: start from the check date and working backward in time, - # how many days do we include in the window of date to check for anomalies? - # Choosing 1 day checks just the check data itself. + # how many days do we want to check for anomalies? + # Choosing 1 day checks just the daily data. recent_lookbehind = timedelta(days=1) # semirecent_lookbehind: starting from the check date and working backward - # in time, how many days -- before subtracting out the "recent" days --- - # do we use to form the reference statistics? + # in time, how many days do we use to form the reference statistics. semirecent_lookbehind = timedelta(days=7) - # ## New from reference code. - # # TODO: Check recent data against both semirecent and API data. - # start_checking_date = max(min(all_frames["date"]) + semirecent_lookbehind - 1, - # max(all_frames["date"]) - max_check_lookbehind + 1) - # end_checking_date = max(all_frames["date"]) - - # if (start_checking_date > end_checking_date): - # self.raised.append(ValidationError((start_checking_date, end_checking_date), "not enough days included in the dataframe to perform sanity checks")) - - # # Loop over all sets of dates for a given CSV. 
- # for checking_date in range(start_checking_date, end_checking_date): - # # TODO: Implement get_known_irregularities(). Add irregularity flags to other checks. - # known_irregularities = get_known_irregularities(checking_date, filename) - - # recent_cutoff_date = checking_date - recent_lookbehind + 1 - # semirecent_cutoff_date = checking_date - semirecent_lookbehind + 1 - - # recent_df_to_test = all_frames.query('date <= @checking_date & date >= @recent_cutoff_date') - # semirecent_df_to_test = all_frames.query('date <= @checking_date & date < @recent_cutoff_date & date >= @semirecent_cutoff_date') - - # if (recent_df_to_test["se"].isnull().mean() > 0.5): - # self.raised.append('Recent se values are >50% NA') - - # self.check_max_date_vs_reference(recent_df_to_test, semirecent_df_to_test) - - # if sanity_check_rows_per_day: - # self.check_rapid_change(recent_df_to_test, semirecent_df_to_test) - - # # TODO: Get smooth_option from CSV name. - # if sanity_check_value_diffs: - # self.check_avg_val_diffs(recent_df_to_test, semirecent_df_to_test, smooth_option) - - # if check_vs_working: - # pass - - # ## Goal is to see if past data has been changed. - # # check_fbsurvey_generated_covidalert_vs_working(covidalert.df.to.test, specified.signal, geo, start.checking.date-1L) - # # Get all files for a given indicator. - # # get_working_covidalert_daily("fb-survey", signal, geo.type, fbsurvey.extra.listcolspec) - # # filter by date to keep only rows with date <= end_comparison_date, and date >= min date seen in both reference and test data - # # if, after filtering, the reference data has 0 rows, stop with a message "not any reference data in the reference period" - - # # full join test and reference data on geo_id and date - # # create new columns checking whether a given field (val, se, sample_size) is missing in both test and reference data, or if it is within a set tolerance - # # filter data to keep only rows where one of the new columns is true. 
if df is not empty, raise error that "mismatches between less recent data in data frame to test and corresponding reference data frame" - smooth_option_regex = re.compile(r'([^_]+)') + # TODO: Remove for actual version kroc = 0 + # Comparison checks # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). - for recent_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): + # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. + for geo_sig_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): m = smooth_option_regex.match(sig) smooth_option = m.group(1) @@ -578,42 +540,52 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera if smooth_option not in ('raw', 'smoothed'): smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' - #recent_df.set_index("time_value", inplace = True) - print("Printing recent_df scenes:", recent_df.shape) - print(recent_df) - for checking_date in date_list: - # -recent- dataframe run backwards from the checking_date - recent_end_date = checking_date - recent_lookbehind - recent_begin_date = checking_date - max_check_lookbehind - recent_api_df = covidcast.signal( - data_source, sig, recent_begin_date, recent_end_date, geo) + weight_option = 'weighted' if 'wili' in sig or 'wcli' in sig else 'unweighted' - # Replace None with NA to make numerical manipulation easier. - recent_api_df.replace( - to_replace=[None], value=np.nan, inplace=True) + print("Printing geo_sig_df scenes:", geo_sig_df.shape) + print(geo_sig_df) - # Rename columns to match those in df_to_test. 
- recent_api_df.rename( - columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}, inplace=True) - recent_api_df.drop( - ['direction', 'issue', 'lag'], axis=1, inplace=True) + max_date = geo_sig_df["time_value"].max() + self.check_min_allowed_max_date( + max_date, generation_date, weight_option) + self.check_max_allowed_max_date(max_date, generation_date) + + # Check data from a group of dates against recent (previous 7 days, by default) data from the API. + for checking_date in date_list: + recent_cutoff_date = checking_date - recent_lookbehind + recent_df = geo_sig_df.query( + 'time_value <= @checking_date & time_value >= @recent_cutoff_date') + + # Reference dataframe runs backwards from the checking_date + reference_start_date = checking_date - \ + min(semirecent_lookbehind, max_check_lookbehind) + reference_end_date = recent_cutoff_date - timedelta(days=1) + reference_api_df = covidcast.signal( + data_source, sig, reference_start_date, reference_end_date, geo) - # Reorder columns. column_names = ["geo_id", "val", "se", "sample_size", "time_value"] - recent_api_df = recent_api_df.reindex(columns=column_names) + + # Replace None with NA to make numerical manipulation easier. + # Rename and reorder columns to match those in df_to_test. 
+ reference_api_df = reference_api_df.replace( + to_replace=[None], value=np.nan).rename( + columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}).drop( + ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) if recent_df["se"].isnull().mean() > 0.5: - self.raised.append('Recent se values are >50% NA') + self.raised.append( + ((sig, geo, checking_date), 'Recent se values are >50% NA')) if sanity_check_rows_per_day: self.check_rapid_change( - recent_df, recent_api_df, checking_date, date_list, sig, geo) + recent_df, reference_api_df, checking_date, sig, geo) if sanity_check_value_diffs: self.check_avg_val_diffs( - recent_df, recent_api_df, smooth_option) + recent_df, reference_api_df, smooth_option) + # TODO: Remove for actual version kroc += 1 if kroc == 2: break diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py new file mode 100644 index 000000000..932cacaf9 --- /dev/null +++ b/validator/tests/test_checks.py @@ -0,0 +1,18 @@ +import pytest +import pandas as pd + +from delphi_validator.validate import Validator + +# # Define constants. +# PARAMS = read_params() +# DATA_FILEPATH = PARAMS["input_file"] + + +class TestCheckBadVal: + validator = Validator() + + def test_empty_df(self): + empty_df = pd.DataFrame(columns=["val"]) + self.validator.check_bad_val(empty_df, "") + + assert len(self.validator.raised) == 0 From ddb13c34f2d29eb19f84b6c7cec1a586d0496a1a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 2 Oct 2020 12:08:11 -0400 Subject: [PATCH 060/151] Removed unused datafetch functions. Added more identifying info to error messages. 
Fixed bug in check_missing_dates() method --- validator/delphi_validator/datafetcher.py | 44 ------ validator/delphi_validator/validate.py | 160 +++++++++++----------- 2 files changed, 82 insertions(+), 122 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 6c038c355..ed70051d8 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -18,29 +18,6 @@ r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') -def get_filenames_with_geo_signal(path, data_source, date_slist: List[str]): - """ - Gets list of filenames in data folder and list of expected geo type-signal type combinations. - - Arguments: - - path: path to data CSVs - - data_source: str; data source name, one of - https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html - - date_slist: list of dates (formatted as strings) to check - - Returns: - - list of filenames - - list of geo type-signal type combinations that we expect to see - """ - geo_sig_cmbo = get_geo_sig_cmbo(data_source) - - for cmb in geo_sig_cmbo: - print(cmb) - - filenames = read_relevant_date_filenames(path, date_slist[0]) - return filenames, geo_sig_cmbo - - def get_geo_sig_cmbo(data_source): """ Get list of geo type-signal type combinations that we expect to see, based on @@ -80,27 +57,6 @@ def read_filenames(path): return daily_filenames -def read_relevant_date_filenames(data_path, date_slist): - """ - Return a list of tuples of every filename in the specified directory if the file is in the specified date range. - - Arguments: - - data_path: path to the directory containing CSV data files. 
- - date_slist: list of dates (formatted as strings) to check - - Returns: - - list - """ - all_files = [f for f in listdir(data_path) if isfile(join(data_path, f))] - filenames = list() - - for fl in all_files: - for dt in date_slist: - if fl.find(dt) != -1: - filenames.append(fl) - return filenames - - def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): """ Generator that assembles data within the specified date range for a given geo_sig_cmbo. diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index ddc0acf37..8bb859d66 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -54,7 +54,7 @@ def __init__(self): def make_date_filter(self, start_date, end_date): """ - Create a function to return a boolean of whether a filename of appropriate format contains a date within the specified date range. + Create a function to return a boolean of whether a filename of appropriate format contains a date within (inclusive) the specified date range. Arguments: - start_date: datetime date object @@ -85,7 +85,7 @@ def f(match): code = int(match.groupdict()['date']) # Return boolean True if current file date "code" is within the defined date range. 
- return start_code < code < end_code + return start_code <= code <= end_code return f @@ -102,25 +102,53 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio Returns: - None """ - if not isinstance(max_check_lookbehind, timedelta): - self.raised.append(ValidationError(max_check_lookbehind, f"max_check_lookbehind ({max_check_lookbehind}) must be of type datetime.timedelta")) + self.raised.append(ValidationError( + max_check_lookbehind, "max_check_lookbehind must be of type datetime.timedelta")) if not isinstance(generation_date, date) or generation_date > date.today(): - self.raised.append(ValidationError(generation_date, f"generation_date must be a datetime.date type and not in the future.")) + self.raised.append(ValidationError( + generation_date, "generation_date must be a datetime.date type and not in the future.")) pattern_found = filename_regex.match(nameformat) if not nameformat or not pattern_found: self.raised.append(ValidationError( - nameformat, 'nameformat ({nameformat}) not recognized')) + nameformat, 'nameformat not recognized')) if not isinstance(df_to_test, pd.DataFrame): self.raised.append(ValidationError( nameformat, 'df_to_test must be a pandas dataframe.')) - # TODO: check column names and types in df_to_test. Currently skipped since load_csv() specifies field names and types on read. Extra columns will simply be ignored during later processing. + def check_missing_dates(self, daily_filenames, start_date, end_date): + """ + Check for missing dates between the specified start and end dates. + + Arguments: + - daily_filenames: list of CSV source data filenames. + - sdate: start date, in datetime format + - edate: end date, in datetime format - def check_bad_geo_id(self, df_to_test, geo_type): + Returns: + - None + """ + number_of_dates = end_date - start_date + timedelta(days=1) + + # Create set of all expected dates. 
+ date_seq = {start_date + timedelta(days=x) + for x in range(number_of_dates.days)} + # Create set of all dates seen in CSV names. + unique_dates = {datetime.strptime( + daily_filename[0][0:8], '%Y%m%d').date() for daily_filename in daily_filenames} + + # Diff expected and observed dates. + check_dateholes = list(date_seq.difference(unique_dates)) + check_dateholes.sort() + + if check_dateholes: + self.raised.append(ValidationError( + check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) + + def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ Check validity of geo type and values, according to regex pattern. @@ -133,7 +161,7 @@ def check_bad_geo_id(self, df_to_test, geo_type): """ if geo_type not in negated_regex_dict: self.raised.append(ValidationError( - geo_type, "Unrecognized geo type")) + (nameformat, geo_type), "Unrecognized geo type")) def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ @@ -143,43 +171,11 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): negated_regex) if len(ugeo) > 0] if len(unexpected_geos) > 0: self.raised.append(ValidationError( - unexpected_geos, "Non-conforming geo_ids exist!")) + (nameformat, unexpected_geos), "Non-conforming geo_ids found")) find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) - def check_missing_dates(self, daily_filenames, sdate, edate): - """ - Check for missing dates between the specified start and end dates. - - Arguments: - - daily_filenames: list of CSV source data filenames. - - sdate: start date, in datetime format - - edate: end date, in datetime format - - Returns: - - None - """ - number_of_dates = edate - sdate + timedelta(days=1) - - # Create set of all expected dates. - date_seq = {sdate + timedelta(days=x) - for x in range(number_of_dates.days)} - unique_dates = set() - - # Add each date seen in CSV names to set. 
- for daily_filename in daily_filenames: - unique_dates.add(datetime.strptime( - daily_filename[0][0:8], '%Y%m%d')) - - # Diff expected and observed dates. - check_dateholes = list(date_seq.difference(unique_dates)) - check_dateholes.sort() - - if check_dateholes: - self.raised.append(ValidationError( - check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) - - def check_bad_val(self, df_to_test, signal_type): + def check_bad_val(self, df_to_test, nameformat, signal_type): """ Check value field for validity. @@ -197,22 +193,22 @@ def check_bad_val(self, df_to_test, signal_type): if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: self.raised.append(ValidationError( - signal_type, "val column can't have any cell greater than 100 for percents")) + nameformat, "val column can't have any cell greater than 100 for percents")) if proportion_option: if not df_to_test[(df_to_test['val'] > 100000)].empty: self.raised.append(ValidationError( - signal_type, "val column can't have any cell greater than 100000 for proportions")) + signal_type, "val column can't have any cell greater than 100000 for nameformat")) if df_to_test['val'].isnull().values.any(): self.raised.append(ValidationError( - signal_type, "val column can't have any cell that is NA")) + nameformat, "val column can't have any cell that is NA")) if not df_to_test[(df_to_test['val'] < 0)].empty: self.raised.append(ValidationError( - signal_type, "val column can't have any cell smaller than 0")) + nameformat, "val column can't have any cell smaller than 0")) - def check_bad_se(self, df_to_test, missing_se_allowed): + def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): """ Check standard errors for validity. 
@@ -232,7 +228,8 @@ def check_bad_se(self, df_to_test, missing_se_allowed): if not missing_se_allowed: if df_to_test['se'].isnull().values.any(): - self.raised.append(ValidationError(None, "se must not be NA")) + self.raised.append(ValidationError( + nameformat, "se must not be NA")) # Find rows not in the allowed range for se. result = df_to_test.query( @@ -240,7 +237,7 @@ def check_bad_se(self, df_to_test, missing_se_allowed): if not result.empty: self.raised.append(ValidationError( - None, "se must be in (0, min(50,val*(1+eps))]")) + nameformat, "se must be in (0, min(50,val*(1+eps))]")) elif missing_se_allowed: result = df_to_test.query( @@ -248,15 +245,15 @@ def check_bad_se(self, df_to_test, missing_se_allowed): if not result.empty: self.raised.append(ValidationError( - None, "se must be NA or in (0, min(50,val*(1+eps))]")) + nameformat, "se must be NA or in (0, min(50,val*(1+eps))]")) result = df_to_test.query('(val == 0) & (se == 0)') if not result.empty: self.raised.append(ValidationError( - None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) + nameformat, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) - def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_size_allowed): + def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, missing_sample_size_allowed): """ Check sample sizes for validity. 
@@ -271,14 +268,14 @@ def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_ if not missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): self.raised.append(ValidationError( - None, "sample_size must not be NA")) + nameformat, "sample_size must not be NA")) # Find rows with sample size less than minimum allowed result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: self.raised.append(ValidationError( - None, "sample size must be >= {minimum_sample_size}")) + nameformat, "sample size must be >= {minimum_sample_size}")) elif missing_sample_size_allowed: result = df_to_test.query( @@ -286,9 +283,9 @@ def check_bad_sample_size(self, df_to_test, minimum_sample_size, missing_sample_ if not result.empty: self.raised.append(ValidationError( - None, "sample size must be NA or >= {minimum_sample_size}")) + nameformat, "sample size must be NA or >= {minimum_sample_size}")) - def check_min_allowed_max_date(self, max_date, generation_date, weighted_option='unweighted'): + def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, geo, sig): """ Check if time since data was generated is reasonable or too long ago. @@ -310,9 +307,9 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option= if max_date < generation_date - thres: self.raised.append(ValidationError( - None, "most recent date of generated file seems too long ago")) + (geo, sig, max_date), "most recent date of generated file seems too long ago")) - def check_max_allowed_max_date(self, max_date, generation_date): + def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): """ Check if time since data was generated is reasonable or too recent. 
@@ -325,9 +322,9 @@ def check_max_allowed_max_date(self, max_date, generation_date): """ if max_date > generation_date - timedelta(days=1): self.raised.append(ValidationError( - None, "most recent date of generated file seems too recent")) + (geo, sig, max_date), "most recent date of generated file seems too recent")) - def check_max_date_vs_reference(self, df_to_test, df_to_reference): + def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo, sig): """ Check if reference data is more recent than test data. @@ -338,14 +335,14 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference): Returns: - None """ - if df_to_test["date"].max() < df_to_reference["date"].max(): - self.raised.append(ValidationError((df_to_test["date"].max(), df_to_reference["date"].max()), + if df_to_test["time_value"].max() < df_to_reference["time_value"].max(): + self.raised.append(ValidationError((checking_date.date(), geo, sig, df_to_test["time_value"].max(), df_to_reference["time_value"].max()), 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + 'to handle this case, and this situation may indicate that something locally is out of date,' + 'or, if the local working files have already been compared against the reference,' + 'that there is a bug somewhere')) - def check_rapid_change(self, df_to_test, df_to_reference, checking_date, sig, geo): + def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, sig): """ Compare number of obervations per day in test dataframe vs reference dataframe. 
@@ -366,10 +363,10 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, sig, ge set(df_to_reference["time_value"])) if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: - self.raised.append(ValidationError((checking_date, sig, geo), + self.raised.append(ValidationError((checking_date.date(), sig, geo), "Number of rows per day (-with-any-rows) seems to have changed rapidly (reference vs test data)")) - def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): + def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checking_date, geo, sig): """ Compare average values for each variable in test dataframe vs reference dataframe. @@ -436,7 +433,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option): flag = mean_stddiff_high or mean_stdabsdiff_high if flag: - self.raised.append(ValidationError((mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' + self.raised.append(ValidationError((checking_date.date(), sig, geo, mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' + 'large --- either large increase tending toward one direction or large mean absolute' + 'difference, relative to average values of corresponding variables. For the former' + 'check, tolerances for `val` are more restrictive than those for other columns.')) @@ -467,6 +464,7 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera sanity_check_rows_per_day = params.get( 'sanity_check_rows_per_day', True) sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) + # TODO: use for something... 
See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 check_vs_working = params.get('check_vs_working', True) # Get relevant data file names and info. @@ -491,13 +489,11 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera self.validate_daily( df, filename, max_check_lookbehind, generation_date) - self.check_bad_geo_id(df, match.groupdict()['geo_type']) - self.check_bad_val(df, match.groupdict()['signal']) - self.check_bad_se(df, missing_se_allowed) + self.check_bad_geo_id(df, filename, match.groupdict()['geo_type']) + self.check_bad_val(df, filename, match.groupdict()['signal']) + self.check_bad_se(df, filename, missing_se_allowed) self.check_bad_sample_size( - df, minimum_sample_size, missing_sample_size_allowed) - - # TODO: Check to see, if this date is in the API, values have been updated and changed significantly. + df, filename, minimum_sample_size, missing_sample_size_allowed) # Get geo_type, date, and signal name as specified by CSV name. df['geo_type'] = match.groupdict()['geo_type'] @@ -547,8 +543,13 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date( - max_date, generation_date, weight_option) - self.check_max_allowed_max_date(max_date, generation_date) + max_date, generation_date, weight_option, geo, sig) + self.check_max_allowed_max_date( + max_date, generation_date, geo, sig) + + # TODO: Check to see, if this date is in the API, if values have been updated and changed significantly. + + # TODO: Compare data against long-ago (3 months?) API data for changes in trends. # Check data from a group of dates against recent (previous 7 days, by default) data from the API. 
for checking_date in date_list: @@ -575,15 +576,18 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera if recent_df["se"].isnull().mean() > 0.5: self.raised.append( - ((sig, geo, checking_date), 'Recent se values are >50% NA')) + ((checking_date.date(), geo, sig), 'Recent se values are >50% NA')) + + self.check_max_date_vs_reference( + recent_df, reference_api_df, checking_date, geo, sig) if sanity_check_rows_per_day: self.check_rapid_change( - recent_df, reference_api_df, checking_date, sig, geo) + recent_df, reference_api_df, checking_date, geo, sig) if sanity_check_value_diffs: self.check_avg_val_diffs( - recent_df, reference_api_df, smooth_option) + recent_df, reference_api_df, smooth_option, checking_date, geo, sig) # TODO: Remove for actual version kroc += 1 From 72e4b508fe4d913eb6d87afb013b5d3f79fd6b6b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 2 Oct 2020 13:41:51 -0400 Subject: [PATCH 061/151] add current and planned features doc --- validator/PLANS.md | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 validator/PLANS.md diff --git a/validator/PLANS.md b/validator/PLANS.md new file mode 100644 index 000000000..be882d90e --- /dev/null +++ b/validator/PLANS.md @@ -0,0 +1,47 @@ +# Validator checks and features + +## Current checks for indicator source data + +* Appropriate file name +* Recognized geographical type (county, state, etc) +* Recognized geo id format (e.g. 
state is two lowercase letters) +* Missing geo type + signal + date combos based on the geo type + signal combos Covidcast metadata says should be available +* Missing ‘val’ values +* Negative ‘val’ values +* Out-of-range ‘val’ values (>0 for all signals, <=100 for percents, <=100 000 for proportions) +* Missing ‘se’ values +* Appropriate ‘se’ values, within a calculated reasonable range +* Stderr != 0 +* If signal and stderr both = 0 (seen in Quidel data due to lack of Jeffreys correction, issue 255) +* Missing ‘sample_size’ values +* Appropriate ‘sample_size’ values, ≥ 100 (default) or user-defined threshold +* Similar number of obs per day as recent API data +* Similar average values as API data +* Missing dates within the selected range + +## Current features + +* Errors are summarized in class attribute +* Various check settings are controllable via indicator-specific params.json files + +## Checks + features wishlist, and problems to think about: + +* check for large jumps +* Which, if any, specific geo_ids are missing +* different thresholds for different files? +* flags to disable certain checks +* re-adjust thresholds to not pass for some previous days that did have issues +* check number of observations +* add tests +* check for duplicate rows +* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data (JHU examples) or is very different from the value it replaced +* Data correctness and consistency over longer time periods (weeks to months) + * Long-term trends + * Currently, checks look at a data window of a few days + * Ryan’s correlation notebook for ideas + * E.g. Doctor visits decreasing correlation with cases + * E.g. WY/RI missing or very low compared to historical +* Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. 
+* Order raised exceptions by p-value, correcting for multiple testing +* Nicer formatting for error “report” +* Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first From 9950104c6ebee60bec2572b8a500a6b07f5219f7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 3 Oct 2020 09:28:44 -0400 Subject: [PATCH 062/151] Added uniquely identifying check ids in standard format. Plan to use for manual check suppression --- validator/PLANS.md | 8 +- validator/delphi_validator/validate.py | 144 +++++++++++++++++-------- 2 files changed, 104 insertions(+), 48 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index be882d90e..4a548a4ad 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -12,7 +12,7 @@ * Missing ‘se’ values * Appropriate ‘se’ values, within a calculated reasonable range * Stderr != 0 -* If signal and stderr both = 0 (seen in Quidel data due to lack of Jeffreys correction, issue 255) +* If signal and stderr both = 0 (seen in Quidel data due to lack of Jeffreys correction, [issue 255](https://github.com/cmu-delphi/covidcast-indicators/issues/255#issuecomment-692196541)) * Missing ‘sample_size’ values * Appropriate ‘sample_size’ values, ≥ 100 (default) or user-defined threshold * Similar number of obs per day as recent API data @@ -32,13 +32,13 @@ * flags to disable certain checks * re-adjust thresholds to not pass for some previous days that did have issues * check number of observations -* add tests +* tests * check for duplicate rows -* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data (JHU examples) or is very different from the value it replaced +* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t 
agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced * Data correctness and consistency over longer time periods (weeks to months) * Long-term trends * Currently, checks look at a data window of a few days - * Ryan’s correlation notebook for ideas + * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas * E.g. Doctor visits decreasing correlation with cases * E.g. WY/RI missing or very low compared to historical * Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 8bb859d66..57de8be9d 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -36,12 +36,14 @@ def reldiff_by_min(x, y): class ValidationError(Exception): """ Error raised when validation check fails. 
""" - def __init__(self, expression, message): + def __init__(self, check_data_id, expression, message): """ Arguments: - - expression: relevant variables to message, e.g., if a date doesn't pass a check, provide a list of the date and the filename of the CSV it originated from + - check_data_id: str or tuple/list of str uniquely identifying the check that was run and on what data + - expression: relevant variables to message, e.g., if a date doesn't pass a check, provide the date - message: str explaining why an error was raised """ + self.check_data_id = tuple(check_data_id) self.expression = expression self.message = message @@ -104,20 +106,30 @@ def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generatio """ if not isinstance(max_check_lookbehind, timedelta): self.raised.append(ValidationError( - max_check_lookbehind, "max_check_lookbehind must be of type datetime.timedelta")) + ("check_type_max_check_lookbehind", nameformat), + max_check_lookbehind, + "max_check_lookbehind must be of type datetime.timedelta")) - if not isinstance(generation_date, date) or generation_date > date.today(): + if not isinstance(generation_date, date): self.raised.append(ValidationError( - generation_date, "generation_date must be a datetime.date type and not in the future.")) + ("check_type_generation_date", nameformat), generation_date, + "generation_date must be a datetime.date type")) + + if not generation_date > date.today(): + self.raised.append(ValidationError( + ("check_future_generation_date", nameformat), generation_date, + "generation_date must not be in the future")) pattern_found = filename_regex.match(nameformat) if not nameformat or not pattern_found: self.raised.append(ValidationError( + ("check_filename_format", nameformat), nameformat, 'nameformat not recognized')) if not isinstance(df_to_test, pd.DataFrame): self.raised.append(ValidationError( - nameformat, 'df_to_test must be a pandas dataframe.')) + ("check_file_data_format", nameformat), + 
type(df_to_test), 'df_to_test must be a pandas dataframe.')) def check_missing_dates(self, daily_filenames, start_date, end_date): """ @@ -146,7 +158,10 @@ def check_missing_dates(self, daily_filenames, start_date, end_date): if check_dateholes: self.raised.append(ValidationError( - check_dateholes, "Missing dates are observed; if these dates are already in the API they would not be updated")) + "check_missing_date_files", + check_dateholes, + "Missing dates are observed; if these dates are" + + " already in the API they would not be updated")) def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ @@ -161,7 +176,8 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ if geo_type not in negated_regex_dict: self.raised.append(ValidationError( - (nameformat, geo_type), "Unrecognized geo type")) + ("check_geo_type", nameformat), + geo_type, "Unrecognized geo type")) def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ @@ -171,7 +187,8 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): negated_regex) if len(ugeo) > 0] if len(unexpected_geos) > 0: self.raised.append(ValidationError( - (nameformat, unexpected_geos), "Non-conforming geo_ids found")) + ("check_geo_id_format", nameformat), + unexpected_geos, "Non-conforming geo_ids found")) find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) @@ -193,20 +210,27 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: self.raised.append(ValidationError( - nameformat, "val column can't have any cell greater than 100 for percents")) + ("check_val_pct_gt_100", nameformat), + df_to_test[(df_to_test['val'] > 100)], + "val column can't have any cell greater than 100 for percents")) if proportion_option: if not df_to_test[(df_to_test['val'] > 100000)].empty: self.raised.append(ValidationError( - signal_type, "val column can't have any cell greater than 100000 for nameformat")) + 
("check_val_prop_gt_100k", nameformat), + df_to_test[(df_to_test['val'] > 100000)], + "val column can't have any cell greater than 100000 for nameformat")) if df_to_test['val'].isnull().values.any(): self.raised.append(ValidationError( - nameformat, "val column can't have any cell that is NA")) + ("check_val_missing", nameformat), + None, "val column can't have any cell that is NA")) if not df_to_test[(df_to_test['val'] < 0)].empty: self.raised.append(ValidationError( - nameformat, "val column can't have any cell smaller than 0")) + ("check_val_lt_0", nameformat), + df_to_test[(df_to_test['val'] < 0)], + "val column can't have any cell smaller than 0")) def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): """ @@ -229,7 +253,8 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): if not missing_se_allowed: if df_to_test['se'].isnull().values.any(): self.raised.append(ValidationError( - nameformat, "se must not be NA")) + ("check_se_missing", nameformat), + None, "se must not be NA")) # Find rows not in the allowed range for se. 
result = df_to_test.query( @@ -237,7 +262,8 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): if not result.empty: self.raised.append(ValidationError( - nameformat, "se must be in (0, min(50,val*(1+eps))]")) + ("check_se_in_range", nameformat), + result, "se must be in (0, min(50,val*(1+eps))]")) elif missing_se_allowed: result = df_to_test.query( @@ -245,13 +271,20 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): if not result.empty: self.raised.append(ValidationError( - nameformat, "se must be NA or in (0, min(50,val*(1+eps))]")) + ("check_se_missing_or_in_range", nameformat), + result, "se must be NA or in (0, min(50,val*(1+eps))]")) - result = df_to_test.query('(val == 0) & (se == 0)') + result_jeffreys = df_to_test.query('(val == 0) & (se == 0)') + result_alt = df_to_test.query('se == 0') - if not result.empty: + if not result_jeffreys.empty: self.raised.append(ValidationError( - nameformat, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) + ("check_se_0_when_val_0", nameformat), + None, "when signal value is 0, se must be non-zero. 
please use Jeffreys correction to generate an appropriate se")) + elif not result_alt.empty: + self.raised.append(ValidationError( + ("check_se_0", nameformat), + result_alt, "se must be non-zero")) def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, missing_sample_size_allowed): """ @@ -268,14 +301,16 @@ def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, mis if not missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): self.raised.append(ValidationError( - nameformat, "sample_size must not be NA")) + ("check_n_missing", nameformat), + None, "sample_size must not be NA")) # Find rows with sample size less than minimum allowed result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: self.raised.append(ValidationError( - nameformat, "sample size must be >= {minimum_sample_size}")) + ("check_n_gt_min", nameformat), + result, "sample size must be >= {minimum_sample_size}")) elif missing_sample_size_allowed: result = df_to_test.query( @@ -283,7 +318,9 @@ def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, mis if not result.empty: self.raised.append(ValidationError( - nameformat, "sample size must be NA or >= {minimum_sample_size}")) + ("check_n_missing_or_gt_min", nameformat), + result, + "sample size must be NA or >= {minimum_sample_size}")) def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, geo, sig): """ @@ -307,7 +344,9 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, if max_date < generation_date - thres: self.raised.append(ValidationError( - (geo, sig, max_date), "most recent date of generated file seems too long ago")) + ("check_min_max_date", geo, sig), + max_date, + "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): """ @@ -322,7 +361,9 @@ def check_max_allowed_max_date(self, max_date, 
generation_date, geo, sig): """ if max_date > generation_date - timedelta(days=1): self.raised.append(ValidationError( - (geo, sig, max_date), "most recent date of generated file seems too recent")) + ("check_max_max_date", geo, sig), + max_date, + "most recent date of generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo, sig): """ @@ -336,11 +377,15 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date - None """ if df_to_test["time_value"].max() < df_to_reference["time_value"].max(): - self.raised.append(ValidationError((checking_date.date(), geo, sig, df_to_test["time_value"].max(), df_to_reference["time_value"].max()), - 'reference df has days beyond the max date in the =df_to_test=; checks are not constructed' + - 'to handle this case, and this situation may indicate that something locally is out of date,' + - 'or, if the local working files have already been compared against the reference,' + - 'that there is a bug somewhere')) + self.raised.append(ValidationError( + ("check_max_date_vs_reference", checking_date.date(), geo, sig), + (df_to_test["time_value"].max(), + df_to_reference["time_value"].max()), + 'reference df has days beyond the max date in the =df_to_test=; ' + + 'checks are not constructed to handle this case, and this situation ' + + 'may indicate that something locally is out of date, or, if the local ' + + 'working files have already been compared against the reference, ' + + 'that there is a bug somewhere')) def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, sig): """ @@ -363,8 +408,11 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si set(df_to_reference["time_value"])) if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: - self.raised.append(ValidationError((checking_date.date(), sig, geo), - "Number of rows per day (-with-any-rows) seems to 
have changed rapidly (reference vs test data)")) + self.raised.append(ValidationError( + ("check_rapid_change_num_rows", checking_date.date(), geo, sig), + (test_rows_per_reporting_day, reference_rows_per_reporting_day), + "Number of rows per day (-with-any-rows) seems to have changed " + + "rapidly (reference vs test data)")) def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checking_date, geo, sig): """ @@ -378,7 +426,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki Returns: - None """ - # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA values by default. + # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA by default. df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[ ['val', 'se', 'sample_size']].mean() df_to_test["type"] = "test" @@ -390,10 +438,11 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki df_all = pd.concat([df_to_test, df_to_reference]) # For each variable (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. 
- df_all = pd.melt(df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] - ).pivot(index=("geo_id", "variable"), columns="type", values="value" - ).reset_index(("geo_id", "variable") - ).dropna( + df_all = pd.melt( + df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] + ).pivot(index=("geo_id", "variable"), columns="type", values="value" + ).reset_index(("geo_id", "variable") + ).dropna( ).assign( type_diff=lambda x: x["test"] - x["reference"], abs_type_diff=lambda x: abs(x["type_diff"]) @@ -433,10 +482,14 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki flag = mean_stddiff_high or mean_stdabsdiff_high if flag: - self.raised.append(ValidationError((checking_date.date(), sig, geo, mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' - + 'large --- either large increase tending toward one direction or large mean absolute' - + 'difference, relative to average values of corresponding variables. For the former' - + 'check, tolerances for `val` are more restrictive than those for other columns.')) + self.raised.append(ValidationError( + ("check_test_vs_reference_avg_changed", + checking_date.date(), geo, sig), + (mean_stddiff_high, mean_stdabsdiff_high), + 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' + + 'large --- either large increase tending toward one direction or large mean absolute' + + 'difference, relative to average values of corresponding variables. For the former' + + 'check, tolerances for `val` are more restrictive than those for other columns.')) def validate(self, export_dir, start_date, end_date, data_source, params, generation_date=date.today()): """ @@ -467,6 +520,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # TODO: use for something... 
See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 check_vs_working = params.get('check_vs_working', True) + suppressed_errors = set(params.get('suppressed_errors', [])) + # Get relevant data file names and info. export_files = read_filenames(export_dir) date_filter = self.make_date_filter(start_date, end_date) @@ -575,8 +630,9 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) if recent_df["se"].isnull().mean() > 0.5: - self.raised.append( - ((checking_date.date(), geo, sig), 'Recent se values are >50% NA')) + self.raised.append(ValidationError( + ("check_se_many_missing", checking_date.date(), geo, sig), + None, 'Recent se values are >50% NA')) self.check_max_date_vs_reference( recent_df, reference_api_df, checking_date, geo, sig) @@ -594,9 +650,9 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera if kroc == 2: break - self.exit() + self.exit(suppressed_errors) - def exit(self): + def exit(self, suppressed_errors): """ If any exceptions were raised, print and exit with non-zero status. """ From ca0697f604059a4bff35ba2209e115d36107ab20 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 5 Oct 2020 10:52:45 -0400 Subject: [PATCH 063/151] Added API fetch and formatting function. 
Fixed few small bugs --- validator/delphi_validator/datafetcher.py | 35 ++++++++++++++++------- validator/delphi_validator/validate.py | 33 ++++++++------------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index ed70051d8..0678dad27 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -10,6 +10,7 @@ from typing import List from itertools import product import pandas as pd +import numpy as np import covidcast from .errors import APIDataFetchError @@ -113,16 +114,30 @@ def load_csv(path): }) -def fetch_daily_data(data_source, survey_date, geo_type, signal): +def fetch_api_reference(data_source, start_date, end_date, geo, sig): """ - Get API data for a specified date, source, signal, and geo type. + Get and process API data for use as a reference. Formatting is changed to match that of source data CSVs. """ - data_to_reference = covidcast.signal( - data_source, signal, survey_date, survey_date, geo_type) - if not isinstance(data_to_reference, pd.DataFrame): - custom_msg = "Error fetching data on" + str(survey_date) + \ - "for data source:" + data_source + \ - ", signal-type:" + signal + \ - ", geography-type:" + geo_type + api_df = covidcast.signal( + data_source, sig, start_date, end_date, geo) + + if not isinstance(api_df, pd.DataFrame): + custom_msg = "Error fetching data from " + str(survey_date) + \ + " to " + str(end_date) + \ + "for data source: " + data_source + \ + ", signal-type: " + sig + \ + ", geography-type: " + geo + raise APIDataFetchError(custom_msg) - return data_to_reference + + column_names = ["geo_id", "val", + "se", "sample_size", "time_value"] + + # Replace None with NA to make numerical manipulation easier. + # Rename and reorder columns to match those in df_to_test. 
+ api_df = api_df.replace( + to_replace=[None], value=np.nan).rename( + columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}).drop( + ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) + + return(api_df) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 57de8be9d..c12c6a120 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -12,7 +12,7 @@ import numpy as np import covidcast -from .datafetcher import load_csv, read_geo_sig_cmbo_files, read_filenames, get_geo_sig_cmbo, filename_regex +from .datafetcher import * import pdb @@ -265,6 +265,11 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): ("check_se_in_range", nameformat), result, "se must be in (0, min(50,val*(1+eps))]")) + if df_to_test["se"].isnull().mean() > 0.5: + self.raised.append(ValidationError( + ("check_se_many_missing", nameformat), + None, 'Recent se values are >50% NA')) + elif missing_se_allowed: result = df_to_test.query( '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') @@ -345,7 +350,7 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, if max_date < generation_date - thres: self.raised.append(ValidationError( ("check_min_max_date", geo, sig), - max_date, + max_date.date(), "most recent date of generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): @@ -362,7 +367,7 @@ def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): if max_date > generation_date - timedelta(days=1): self.raised.append(ValidationError( ("check_max_max_date", geo, sig), - max_date, + max_date.date(), "most recent date of generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo, sig): @@ -520,7 +525,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # TODO: use 
for something... See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 check_vs_working = params.get('check_vs_working', True) - suppressed_errors = set(params.get('suppressed_errors', [])) + suppressed_errors = set([tuple(item) + for item in params.get('suppressed_errors', [])]) # Get relevant data file names and info. export_files = read_filenames(export_dir) @@ -616,23 +622,8 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera reference_start_date = checking_date - \ min(semirecent_lookbehind, max_check_lookbehind) reference_end_date = recent_cutoff_date - timedelta(days=1) - reference_api_df = covidcast.signal( - data_source, sig, reference_start_date, reference_end_date, geo) - - column_names = ["geo_id", "val", - "se", "sample_size", "time_value"] - - # Replace None with NA to make numerical manipulation easier. - # Rename and reorder columns to match those in df_to_test. - reference_api_df = reference_api_df.replace( - to_replace=[None], value=np.nan).rename( - columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}).drop( - ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) - - if recent_df["se"].isnull().mean() > 0.5: - self.raised.append(ValidationError( - ("check_se_many_missing", checking_date.date(), geo, sig), - None, 'Recent se values are >50% NA')) + reference_api_df = fetch_api_reference( + data_source, reference_start_date, reference_end_date, geo, sig) self.check_max_date_vs_reference( recent_df, reference_api_df, checking_date, geo, sig) From 5942cbb66b7169ca8324e539c530c5376b0577b6 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 5 Oct 2020 11:21:18 -0400 Subject: [PATCH 064/151] Moved params definitions to Validator init as attributes --- validator/delphi_validator/run.py | 11 +- validator/delphi_validator/validate.py | 208 +++++++++++++------------ 2 files changed, 111 
insertions(+), 108 deletions(-) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index c7cca45ae..0ec67641b 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -13,12 +13,5 @@ def run_module(): parent_params = read_params() params = parent_params['validation'] - data_source = params['data_source'] - dtobj_sdate = datetime.date( - datetime.strptime(params['start_date'], '%Y-%m-%d')) - dtobj_edate = datetime.date( - datetime.strptime(params['end_date'], '%Y-%m-%d')) - - validator = Validator() - validator.validate(parent_params["export_dir"], - dtobj_sdate, dtobj_edate, data_source, params) + validator = Validator(params) + validator.validate(parent_params["export_dir"]) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index c12c6a120..0346b2f2c 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -51,8 +51,33 @@ def __init__(self, check_data_id, expression, message): class Validator(): """ Class containing validation() function and supporting functions. Stores a list of all raised errors and warnings. """ - def __init__(self): - self.raised = [] + def __init__(self, params, generation_date=date.today()): + # Get user settings from params or if not provided, set default. 
+ self.data_source = params['data_source'] + self.start_date = datetime.date( + datetime.strptime(params['start_date'], '%Y-%m-%d')) + self.end_date = datetime.date( + datetime.strptime(params['end_date'], '%Y-%m-%d')) + self.generation_date = generation_date + + self.max_check_lookbehind = timedelta( + days=params.get("ref_window_size", 7)) + self.minimum_sample_size = params.get('minimum_sample_size', 100) + self.missing_se_allowed = params.get('missing_se_allowed', False) + self.missing_sample_size_allowed = params.get( + 'missing_sample_size_allowed', False) + + self.sanity_check_rows_per_day = params.get( + 'sanity_check_rows_per_day', True) + self.sanity_check_value_diffs = params.get( + 'sanity_check_value_diffs', True) + # TODO: use for something... See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 + self.check_vs_working = params.get('check_vs_working', True) + + self.suppressed_errors = set([tuple(item) + for item in params.get('suppressed_errors', [])]) + + self.raised_errors = [] def make_date_filter(self, start_date, end_date): """ @@ -91,46 +116,6 @@ def f(match): return f - def validate_daily(self, df_to_test, nameformat, max_check_lookbehind, generation_date): - """ - Perform some automated format & sanity checks of inputs. 
- - Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - - Returns: - - None - """ - if not isinstance(max_check_lookbehind, timedelta): - self.raised.append(ValidationError( - ("check_type_max_check_lookbehind", nameformat), - max_check_lookbehind, - "max_check_lookbehind must be of type datetime.timedelta")) - - if not isinstance(generation_date, date): - self.raised.append(ValidationError( - ("check_type_generation_date", nameformat), generation_date, - "generation_date must be a datetime.date type")) - - if not generation_date > date.today(): - self.raised.append(ValidationError( - ("check_future_generation_date", nameformat), generation_date, - "generation_date must not be in the future")) - - pattern_found = filename_regex.match(nameformat) - if not nameformat or not pattern_found: - self.raised.append(ValidationError( - ("check_filename_format", nameformat), - nameformat, 'nameformat not recognized')) - - if not isinstance(df_to_test, pd.DataFrame): - self.raised.append(ValidationError( - ("check_file_data_format", nameformat), - type(df_to_test), 'df_to_test must be a pandas dataframe.')) - def check_missing_dates(self, daily_filenames, start_date, end_date): """ Check for missing dates between the specified start and end dates. 
@@ -157,12 +142,53 @@ def check_missing_dates(self, daily_filenames, start_date, end_date): check_dateholes.sort() if check_dateholes: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( "check_missing_date_files", check_dateholes, "Missing dates are observed; if these dates are" + " already in the API they would not be updated")) + def check_settings(self, max_check_lookbehind, generation_date): + """ + Perform some automated format & sanity checks of inputs. + + Arguments: + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + + Returns: + - None + """ + if not isinstance(max_check_lookbehind, timedelta): + self.raised_errors.append(ValidationError( + ("check_type_max_check_lookbehind"), + max_check_lookbehind, + "max_check_lookbehind must be of type datetime.timedelta")) + + if not isinstance(generation_date, date): + self.raised_errors.append(ValidationError( + ("check_type_generation_date"), generation_date, + "generation_date must be a datetime.date type")) + + if generation_date > date.today(): + self.raised_errors.append(ValidationError( + ("check_future_generation_date"), generation_date, + "generation_date must not be in the future")) + + def check_df_format(self, df_to_test, nameformat): + pattern_found = filename_regex.match(nameformat) + if not nameformat or not pattern_found: + self.raised_errors.append(ValidationError( + ("check_filename_format", nameformat), + nameformat, 'nameformat not recognized')) + + if not isinstance(df_to_test, pd.DataFrame): + self.raised_errors.append(ValidationError( + ("check_file_data_format", nameformat), + type(df_to_test), 
'df_to_test must be a pandas dataframe.')) + def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ Check validity of geo type and values, according to regex pattern. @@ -175,7 +201,7 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): - None """ if geo_type not in negated_regex_dict: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_geo_type", nameformat), geo_type, "Unrecognized geo type")) @@ -186,7 +212,7 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall( negated_regex) if len(ugeo) > 0] if len(unexpected_geos) > 0: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_geo_id_format", nameformat), unexpected_geos, "Non-conforming geo_ids found")) @@ -209,25 +235,25 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_val_pct_gt_100", nameformat), df_to_test[(df_to_test['val'] > 100)], "val column can't have any cell greater than 100 for percents")) if proportion_option: if not df_to_test[(df_to_test['val'] > 100000)].empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_val_prop_gt_100k", nameformat), df_to_test[(df_to_test['val'] > 100000)], "val column can't have any cell greater than 100000 for nameformat")) if df_to_test['val'].isnull().values.any(): - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_val_missing", nameformat), None, "val column can't have any cell that is NA")) if not df_to_test[(df_to_test['val'] < 0)].empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_val_lt_0", nameformat), df_to_test[(df_to_test['val'] < 0)], "val column can't have any cell smaller than 
0")) @@ -252,7 +278,7 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): if not missing_se_allowed: if df_to_test['se'].isnull().values.any(): - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_missing", nameformat), None, "se must not be NA")) @@ -261,12 +287,12 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): '~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_in_range", nameformat), result, "se must be in (0, min(50,val*(1+eps))]")) if df_to_test["se"].isnull().mean() > 0.5: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_many_missing", nameformat), None, 'Recent se values are >50% NA')) @@ -275,7 +301,7 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') if not result.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_missing_or_in_range", nameformat), result, "se must be NA or in (0, min(50,val*(1+eps))]")) @@ -283,11 +309,11 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): result_alt = df_to_test.query('se == 0') if not result_jeffreys.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_0_when_val_0", nameformat), None, "when signal value is 0, se must be non-zero. 
please use Jeffreys correction to generate an appropriate se")) elif not result_alt.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_se_0", nameformat), result_alt, "se must be non-zero")) @@ -305,7 +331,7 @@ def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, mis """ if not missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_n_missing", nameformat), None, "sample_size must not be NA")) @@ -313,7 +339,7 @@ def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, mis result = df_to_test.query('(sample_size < @minimum_sample_size)') if not result.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_n_gt_min", nameformat), result, "sample size must be >= {minimum_sample_size}")) @@ -322,7 +348,7 @@ def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, mis '~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') if not result.empty: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_n_missing_or_gt_min", nameformat), result, "sample size must be NA or >= {minimum_sample_size}")) @@ -348,7 +374,7 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, weighted_option, lambda: "Invalid weighting option") if max_date < generation_date - thres: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_min_max_date", geo, sig), max_date.date(), "most recent date of generated file seems too long ago")) @@ -365,7 +391,7 @@ def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): - None """ if max_date > generation_date - timedelta(days=1): - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_max_max_date", geo, sig), max_date.date(), 
"most recent date of generated file seems too recent")) @@ -382,7 +408,7 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date - None """ if df_to_test["time_value"].max() < df_to_reference["time_value"].max(): - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_max_date_vs_reference", checking_date.date(), geo, sig), (df_to_test["time_value"].max(), df_to_reference["time_value"].max()), @@ -413,7 +439,7 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si set(df_to_reference["time_value"])) if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_rapid_change_num_rows", checking_date.date(), geo, sig), (test_rows_per_reporting_day, reference_rows_per_reporting_day), "Number of rows per day (-with-any-rows) seems to have changed " + @@ -487,7 +513,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki flag = mean_stddiff_high or mean_stdabsdiff_high if flag: - self.raised.append(ValidationError( + self.raised_errors.append(ValidationError( ("check_test_vs_reference_avg_changed", checking_date.date(), geo, sig), (mean_stddiff_high, mean_stdabsdiff_high), @@ -496,7 +522,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki + 'difference, relative to average values of corresponding variables. For the former' + 'check, tolerances for `val` are more restrictive than those for other columns.')) - def validate(self, export_dir, start_date, end_date, data_source, params, generation_date=date.today()): + def validate(self, export_dir): """ Runs all data checks. @@ -511,35 +537,20 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera Returns: - None """ - # Setup - # Get user settings from params or if not provided, set default. 
- max_check_lookbehind = timedelta(days=params.get("ref_window_size", 7)) - minimum_sample_size = params.get('minimum_sample_size', 100) - missing_se_allowed = params.get('missing_se_allowed', False) - missing_sample_size_allowed = params.get( - 'missing_sample_size_allowed', False) - - sanity_check_rows_per_day = params.get( - 'sanity_check_rows_per_day', True) - sanity_check_value_diffs = params.get('sanity_check_value_diffs', True) - # TODO: use for something... See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 - check_vs_working = params.get('check_vs_working', True) - - suppressed_errors = set([tuple(item) - for item in params.get('suppressed_errors', [])]) - # Get relevant data file names and info. export_files = read_filenames(export_dir) - date_filter = self.make_date_filter(start_date, end_date) + date_filter = self.make_date_filter(self.start_date, self.end_date) # List of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] # Get list of just CSV names. filenames = [name_match_pair[0] for name_match_pair in validate_files] # Get all expected combinations of geo_type and signal. 
- geo_sig_cmbo = get_geo_sig_cmbo(data_source) + geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) - self.check_missing_dates(validate_files, start_date, end_date) + self.check_missing_dates( + validate_files, self.start_date, self.end_date) + self.check_settings(self.max_check_lookbehind, self.generation_date) all_frames = [] @@ -548,13 +559,12 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera for filename, match in validate_files: df = load_csv(join(export_dir, filename)) - self.validate_daily( - df, filename, max_check_lookbehind, generation_date) + self.check_df_format(df, filename) self.check_bad_geo_id(df, filename, match.groupdict()['geo_type']) self.check_bad_val(df, filename, match.groupdict()['signal']) - self.check_bad_se(df, filename, missing_se_allowed) + self.check_bad_se(df, filename, self.missing_se_allowed) self.check_bad_sample_size( - df, filename, minimum_sample_size, missing_sample_size_allowed) + df, filename, self.minimum_sample_size, self.missing_sample_size_allowed) # Get geo_type, date, and signal name as specified by CSV name. df['geo_type'] = match.groupdict()['geo_type'] @@ -604,9 +614,9 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date( - max_date, generation_date, weight_option, geo, sig) + max_date, self.generation_date, weight_option, geo, sig) self.check_max_allowed_max_date( - max_date, generation_date, geo, sig) + max_date, self.generation_date, geo, sig) # TODO: Check to see, if this date is in the API, if values have been updated and changed significantly. 
@@ -620,19 +630,19 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera # Reference dataframe runs backwards from the checking_date reference_start_date = checking_date - \ - min(semirecent_lookbehind, max_check_lookbehind) + min(semirecent_lookbehind, self.max_check_lookbehind) reference_end_date = recent_cutoff_date - timedelta(days=1) reference_api_df = fetch_api_reference( - data_source, reference_start_date, reference_end_date, geo, sig) + self.data_source, reference_start_date, reference_end_date, geo, sig) self.check_max_date_vs_reference( recent_df, reference_api_df, checking_date, geo, sig) - if sanity_check_rows_per_day: + if self.sanity_check_rows_per_day: self.check_rapid_change( recent_df, reference_api_df, checking_date, geo, sig) - if sanity_check_value_diffs: + if self.sanity_check_value_diffs: self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) @@ -641,16 +651,16 @@ def validate(self, export_dir, start_date, end_date, data_source, params, genera if kroc == 2: break - self.exit(suppressed_errors) + self.exit() - def exit(self, suppressed_errors): + def exit(self): """ If any exceptions were raised, print and exit with non-zero status. """ - if self.raised: - print(len(self.raised), "messages") + if self.raised_errors: + print(len(self.raised_errors), "messages") - for message in self.raised: + for message in self.raised_errors: print(message) sys.exit(1) From 0f6c44317e1f369b7e52f5e8bb7c18de216ad9ea Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 5 Oct 2020 13:22:37 -0400 Subject: [PATCH 065/151] Added manual error suppression feature. 
Error messaging reports number of suppressed errors --- validator/delphi_validator/datafetcher.py | 58 +++++++++++------------ validator/delphi_validator/validate.py | 34 +++++++++---- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 0678dad27..d35daec91 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -19,6 +19,35 @@ r'^(?P\d{8})_(?P\w+?)_(?P\w+)\.csv$') +def read_filenames(path): + """ + Return a list of tuples of every filename and regex match to the CSV filename format in the specified directory. + + Arguments: + - path: path to the directory containing CSV data files. + + Returns: + - list of tuples + """ + daily_filenames = [(f, filename_regex.match(f)) + for f in listdir(path) if isfile(join(path, f))] + return daily_filenames + + +def load_csv(path): + """ + Load CSV with specified column types. + """ + return pd.read_csv( + path, + dtype={ + 'geo_id': str, + 'val': float, + 'se': float, + 'sample_size': float, + }) + + def get_geo_sig_cmbo(data_source): """ Get list of geo type-signal type combinations that we expect to see, based on @@ -43,21 +72,6 @@ def get_geo_sig_cmbo(data_source): return geo_sig_cmbo -def read_filenames(path): - """ - Return a list of tuples of every filename and regex match to the CSV filename format in the specified directory. - - Arguments: - - path: path to the directory containing CSV data files. - - Returns: - - list of tuples - """ - daily_filenames = [(f, filename_regex.match(f)) - for f in listdir(path) if isfile(join(path, f))] - return daily_filenames - - def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): """ Generator that assembles data within the specified date range for a given geo_sig_cmbo. 
@@ -100,20 +114,6 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): yield pd.concat(df_list), geo_sig[0], geo_sig[1] -def load_csv(path): - """ - Load CSV with specified column types. - """ - return pd.read_csv( - path, - dtype={ - 'geo_id': str, - 'val': float, - 'se': float, - 'sample_size': float, - }) - - def fetch_api_reference(data_source, start_date, end_date, geo, sig): """ Get and process API data for use as a reference. Formatting is changed to match that of source data CSVs. diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 0346b2f2c..8f44f7674 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -43,7 +43,8 @@ def __init__(self, check_data_id, expression, message): - expression: relevant variables to message, e.g., if a date doesn't pass a check, provide the date - message: str explaining why an error was raised """ - self.check_data_id = tuple(check_data_id) + self.check_data_id = (check_data_id,) if not isinstance( + check_data_id, tuple) and not isinstance(check_data_id, list) else tuple(check_data_id) self.expression = expression self.message = message @@ -74,8 +75,8 @@ def __init__(self, params, generation_date=date.today()): # TODO: use for something... See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 self.check_vs_working = params.get('check_vs_working', True) - self.suppressed_errors = set([tuple(item) - for item in params.get('suppressed_errors', [])]) + self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( + item, list) else tuple(item) for item in params.get('suppressed_errors', [])} self.raised_errors = [] @@ -655,14 +656,31 @@ def validate(self, export_dir): def exit(self): """ - If any exceptions were raised, print and exit with non-zero status. 
+ If any not-suppressed exceptions were raised, print and exit with non-zero status. """ if self.raised_errors: - print(len(self.raised_errors), "messages") + suppressed_counter = 0 + subset_raised_errors = [] - for message in self.raised_errors: - print(message) + for val_error in set(self.raised_errors): + raised_check_id = tuple(item.strftime("%Y-%m-%d") if isinstance( + item, date) or isinstance(item, datetime) else item for item in val_error.check_data_id) - sys.exit(1) + if raised_check_id not in self.suppressed_errors: + subset_raised_errors.append(val_error) + else: + self.suppressed_errors.remove(raised_check_id) + suppressed_counter += 1 + + print(len(subset_raised_errors), "messages") + print(suppressed_counter, "suppressed messages") + + if len(subset_raised_errors) == 0: + sys.exit(0) + else: + for message in subset_raised_errors: + print(message) + + sys.exit(1) else: sys.exit(0) From 9d7779592bdaac1781b455de81e61bdff3706cc9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 5 Oct 2020 13:54:01 -0400 Subject: [PATCH 066/151] Propogate attribute usage throughout class. Update documentation --- validator/delphi_validator/validate.py | 154 ++++++++++++++----------- 1 file changed, 85 insertions(+), 69 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 8f44f7674..8e6f9210d 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -50,16 +50,37 @@ def __init__(self, check_data_id, expression, message): class Validator(): - """ Class containing validation() function and supporting functions. Stores a list of all raised errors and warnings. """ + """ Class containing validation() function and supporting functions. Stores a list of all raised errors, and user settings. """ - def __init__(self, params, generation_date=date.today()): + def __init__(self, params): + """ + Initialize object and set parameters. 
+ + Arguments: + - params: dictionary of user settings; if empty, defaults will be used + + Attributes: + - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html + - start_date: beginning date of data to check, in datetime date format + - end_date: end date of data to check, in datetime date format + - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - minimum_sample_size: int + - missing_se_allowed: boolean indicating if missing standard errors should raise an exception or not + - missing_sample_size_allowed: boolean indicating if missing sample size should raise an exception or not + - sanity_check_rows_per_day: boolean; check flag + - sanity_check_value_diffs: boolean; check flag + - check_vs_working: boolean; check flag + - suppressed_errors: set of check_data_ids used to identify error messages to ignore + - raised_errors: list to append errors to as they are raised + """ # Get user settings from params or if not provided, set default. self.data_source = params['data_source'] self.start_date = datetime.date( datetime.strptime(params['start_date'], '%Y-%m-%d')) self.end_date = datetime.date( datetime.strptime(params['end_date'], '%Y-%m-%d')) - self.generation_date = generation_date + self.generation_date = params.get('generation_date', date.today()) self.max_check_lookbehind = timedelta( days=params.get("ref_window_size", 7)) @@ -89,7 +110,7 @@ def make_date_filter(self, start_date, end_date): - end_date: datetime date object Returns: - - None + - Custom function object """ # Convert dates from datetime format to int. 
start_code = int(start_date.strftime("%Y%m%d")) @@ -117,22 +138,20 @@ def f(match): return f - def check_missing_dates(self, daily_filenames, start_date, end_date): + def check_missing_dates(self, daily_filenames): """ Check for missing dates between the specified start and end dates. Arguments: - daily_filenames: list of CSV source data filenames. - - sdate: start date, in datetime format - - edate: end date, in datetime format Returns: - None """ - number_of_dates = end_date - start_date + timedelta(days=1) + number_of_dates = self.end_date - self.start_date + timedelta(days=1) # Create set of all expected dates. - date_seq = {start_date + timedelta(days=x) + date_seq = {self.start_date + timedelta(days=x) for x in range(number_of_dates.days)} # Create set of all dates seen in CSV names. unique_dates = {datetime.strptime( @@ -149,36 +168,43 @@ def check_missing_dates(self, daily_filenames, start_date, end_date): "Missing dates are observed; if these dates are" + " already in the API they would not be updated")) - def check_settings(self, max_check_lookbehind, generation_date): + def check_settings(self): """ - Perform some automated format & sanity checks of inputs. + Perform some automated format & sanity checks of parameters. 
Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test + - None Returns: - None """ - if not isinstance(max_check_lookbehind, timedelta): + if not isinstance(self.max_check_lookbehind, timedelta): self.raised_errors.append(ValidationError( ("check_type_max_check_lookbehind"), - max_check_lookbehind, + self.max_check_lookbehind, "max_check_lookbehind must be of type datetime.timedelta")) - if not isinstance(generation_date, date): + if not isinstance(self.generation_date, date): self.raised_errors.append(ValidationError( - ("check_type_generation_date"), generation_date, + ("check_type_generation_date"), self.generation_date, "generation_date must be a datetime.date type")) - if generation_date > date.today(): + if self.generation_date > date.today(): self.raised_errors.append(ValidationError( - ("check_future_generation_date"), generation_date, + ("check_future_generation_date"), self.generation_date, "generation_date must not be in the future")) def check_df_format(self, df_to_test, nameformat): + """ + Check basic format of source data CSV df. 
+ + Arguments: + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" + + Returns: + - None + """ pattern_found = filename_regex.match(nameformat) if not nameformat or not pattern_found: self.raised_errors.append(ValidationError( @@ -259,13 +285,13 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): df_to_test[(df_to_test['val'] < 0)], "val column can't have any cell smaller than 0")) - def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): + def check_bad_se(self, df_to_test, nameformat): """ Check standard errors for validity. Arguments: - - df_to_test: pandas dataframe of CSV source data - - missing_se_allowed: boolean indicating if missing standard errors should raise an exception or not + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" Returns: - None @@ -277,7 +303,7 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): df_to_test['se'] = df_to_test['se'].round(3) df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) - if not missing_se_allowed: + if not self.missing_se_allowed: if df_to_test['se'].isnull().values.any(): self.raised_errors.append(ValidationError( ("check_se_missing", nameformat), @@ -297,7 +323,7 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): ("check_se_many_missing", nameformat), None, 'Recent se values are >50% NA')) - elif missing_se_allowed: + elif self.missing_se_allowed: result = df_to_test.query( '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') @@ -318,50 +344,51 @@ def check_bad_se(self, df_to_test, nameformat, missing_se_allowed): ("check_se_0", nameformat), result_alt, "se must be non-zero")) - def check_bad_sample_size(self, df_to_test, nameformat, minimum_sample_size, 
missing_sample_size_allowed): + def check_bad_sample_size(self, df_to_test, nameformat): """ Check sample sizes for validity. Arguments: - - df_to_test: pandas dataframe of a single CSV of source data - - minimum_sample_size: int - - missing_sample_size_allowed: boolean indicating if missing sample size should raise an exception or not + - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" Returns: - None """ - if not missing_sample_size_allowed: + if not self.missing_sample_size_allowed: if df_to_test['sample_size'].isnull().values.any(): self.raised_errors.append(ValidationError( ("check_n_missing", nameformat), None, "sample_size must not be NA")) # Find rows with sample size less than minimum allowed - result = df_to_test.query('(sample_size < @minimum_sample_size)') + result = df_to_test.query( + '(sample_size < @self.minimum_sample_size)') if not result.empty: self.raised_errors.append(ValidationError( ("check_n_gt_min", nameformat), - result, "sample size must be >= {minimum_sample_size}")) + result, "sample size must be >= {self.minimum_sample_size}")) - elif missing_sample_size_allowed: + elif self.missing_sample_size_allowed: result = df_to_test.query( - '~(sample_size.isnull() | (sample_size >= @minimum_sample_size))') + '~(sample_size.isnull() | (sample_size >= @self.minimum_sample_size))') if not result.empty: self.raised_errors.append(ValidationError( ("check_n_missing_or_gt_min", nameformat), result, - "sample size must be NA or >= {minimum_sample_size}")) + "sample size must be NA or >= {self.minimum_sample_size}")) - def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, geo, sig): + def check_min_allowed_max_date(self, max_date, weighted_option, geo, sig): """ Check if time since data was generated is reasonable or too long ago. 
Arguments: - max_date: date of most recent data to be validated; datetime format. - - generation_date: date data to test was generated; datetime format. - - weighted_option: str; selects the "reasonable" threshold + - weighted_option: str; selects the "reasonable" threshold based on signal name + - geo: str; geo type name (county, msa, hrr, state) as in the CSV name + - sig: str; signal name as in the CSV name Returns: - None @@ -374,24 +401,25 @@ def check_min_allowed_max_date(self, max_date, generation_date, weighted_option, thres = switcher.get( weighted_option, lambda: "Invalid weighting option") - if max_date < generation_date - thres: + if max_date < self.generation_date - thres: self.raised_errors.append(ValidationError( ("check_min_max_date", geo, sig), max_date.date(), "most recent date of generated file seems too long ago")) - def check_max_allowed_max_date(self, max_date, generation_date, geo, sig): + def check_max_allowed_max_date(self, max_date, geo, sig): """ Check if time since data was generated is reasonable or too recent. Arguments: - max_date: date of most recent data to be validated; datetime format. - - generation_date: date data to test was generated; datetime format. 
+ - geo: str; geo type name (county, msa, hrr, state) as in the CSV name + - sig: str; signal name as in the CSV name Returns: - None """ - if max_date > generation_date - timedelta(days=1): + if max_date > self.generation_date - timedelta(days=1): self.raised_errors.append(ValidationError( ("check_max_max_date", geo, sig), max_date.date(), @@ -404,6 +432,8 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date Arguments: - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - geo: str; geo type name (county, msa, hrr, state) as in the CSV name + - sig: str; signal name as in the CSV name Returns: - None @@ -426,10 +456,9 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si Arguments: - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - checking_date - - date_list: list of dates to check - - sig: str; signal name as in the CSV name + - checking_date: datetime date - geo: str; geo type name (county, msa, hrr, state) as in the CSV name + - sig: str; signal name as in the CSV name Returns: - None @@ -454,6 +483,8 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 
7dav is "smoothed") + - geo: str; geo type name (county, msa, hrr, state) as in the CSV name + - sig: str; signal name as in the CSV name Returns: - None @@ -529,11 +560,6 @@ def validate(self, export_dir): Arguments: - export_dir: path to data CSVs - - start_date: beginning date of data to check - - end_date: end date of data to check - - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html - - params: dictionary of user settings; if empty, defaults will be used - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test Returns: - None @@ -541,17 +567,18 @@ def validate(self, export_dir): # Get relevant data file names and info. export_files = read_filenames(export_dir) date_filter = self.make_date_filter(self.start_date, self.end_date) + # List of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] + # Get list of just CSV names. filenames = [name_match_pair[0] for name_match_pair in validate_files] # Get all expected combinations of geo_type and signal. geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) - self.check_missing_dates( - validate_files, self.start_date, self.end_date) - self.check_settings(self.max_check_lookbehind, self.generation_date) + self.check_missing_dates(validate_files) + self.check_settings() all_frames = [] @@ -563,9 +590,8 @@ def validate(self, export_dir): self.check_df_format(df, filename) self.check_bad_geo_id(df, filename, match.groupdict()['geo_type']) self.check_bad_val(df, filename, match.groupdict()['signal']) - self.check_bad_se(df, filename, self.missing_se_allowed) - self.check_bad_sample_size( - df, filename, self.minimum_sample_size, self.missing_sample_size_allowed) + self.check_bad_se(df, filename) + self.check_bad_sample_size(df, filename) # Get geo_type, date, and signal name as specified by CSV name. 
df['geo_type'] = match.groupdict()['geo_type'] @@ -594,9 +620,6 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') - # TODO: Remove for actual version - kroc = 0 - # Comparison checks # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. @@ -614,10 +637,8 @@ def validate(self, export_dir): print(geo_sig_df) max_date = geo_sig_df["time_value"].max() - self.check_min_allowed_max_date( - max_date, self.generation_date, weight_option, geo, sig) - self.check_max_allowed_max_date( - max_date, self.generation_date, geo, sig) + self.check_min_allowed_max_date(max_date, weight_option, geo, sig) + self.check_max_allowed_max_date(max_date, geo, sig) # TODO: Check to see, if this date is in the API, if values have been updated and changed significantly. @@ -647,11 +668,6 @@ def validate(self, export_dir): self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) - # TODO: Remove for actual version - kroc += 1 - if kroc == 2: - break - self.exit() def exit(self): From 79e80baa40b6f8fe20bcb0fd14b4a4772c5ffaf2 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 5 Oct 2020 14:39:00 -0400 Subject: [PATCH 067/151] Linting improvements --- validator/delphi_validator/datafetcher.py | 17 +- validator/delphi_validator/run.py | 1 - validator/delphi_validator/validate.py | 208 +++++++++++++--------- 3 files changed, 136 insertions(+), 90 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index d35daec91..7276fdc93 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -7,7 +7,6 @@ from os import listdir from os.path import isfile, join from datetime import datetime -from typing import List from itertools 
import product import pandas as pd import numpy as np @@ -21,7 +20,8 @@ def read_filenames(path): """ - Return a list of tuples of every filename and regex match to the CSV filename format in the specified directory. + Return a list of tuples of every filename and regex match to the CSV filename + format in the specified directory. Arguments: - path: path to the directory containing CSV data files. @@ -77,13 +77,15 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): Generator that assembles data within the specified date range for a given geo_sig_cmbo. Arguments: - - geo_sig_cmbo: list of geo type-signal type combinations that we expect to see, based on combinations reported available by Covidcast metadata + - geo_sig_cmbo: list of geo type-signal type combinations that we expect to see, + based on combinations reported available by Covidcast metadata - data_folder: path to the directory containing CSV data files. - filenames: list of filenames - date_slist: list of dates (formatted as strings) to check Returns: - - dataframe containing data for all dates in date_slist for a given geo type-signal type combination + - dataframe containing data for all dates in date_slist for a given + geo type-signal type combination - relevant geo type (str) - relevant signal type (str) """ @@ -116,13 +118,14 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): def fetch_api_reference(data_source, start_date, end_date, geo, sig): """ - Get and process API data for use as a reference. Formatting is changed to match that of source data CSVs. + Get and process API data for use as a reference. Formatting is changed + to match that of source data CSVs. 
""" api_df = covidcast.signal( data_source, sig, start_date, end_date, geo) if not isinstance(api_df, pd.DataFrame): - custom_msg = "Error fetching data from " + str(survey_date) + \ + custom_msg = "Error fetching data from " + str(start_date) + \ " to " + str(end_date) + \ "for data source: " + data_source + \ ", signal-type: " + sig + \ @@ -140,4 +143,4 @@ def fetch_api_reference(data_source, start_date, end_date, geo, sig): columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}).drop( ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) - return(api_df) + return api_df diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index 0ec67641b..74371518b 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -4,7 +4,6 @@ This module should contain a function called `run_module`, that is executed when the module is run with `python -m delphi_validator`. """ -from datetime import datetime from delphi_utils import read_params from .validate import Validator diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 8e6f9210d..6eb63b102 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -9,12 +9,11 @@ from os.path import join from datetime import date, datetime, timedelta import pandas as pd -import numpy as np -import covidcast -from .datafetcher import * +from .datafetcher import filename_regex, \ + read_filenames, load_csv, get_geo_sig_cmbo, \ + read_geo_sig_cmbo_files, fetch_api_reference -import pdb # Recognized geo types. negated_regex_dict = { @@ -33,14 +32,57 @@ def reldiff_by_min(x, y): return (x - y) / min(x, y) +def make_date_filter(start_date, end_date): + """ + Create a function to return a boolean of whether a filename of appropriate + format contains a date within (inclusive) the specified date range. 
+ + Arguments: + - start_date: datetime date object + - end_date: datetime date object + + Returns: + - Custom function object + """ + # Convert dates from datetime format to int. + start_code = int(start_date.strftime("%Y%m%d")) + end_code = int(end_date.strftime("%Y%m%d")) + + def f(match): + """ + Return a boolean of whether a filename of appropriate format contains a date + within the specified date range. + + Arguments: + - match: regex match object based on filename_regex applied to a filename str + + Returns: + - boolean + """ + # If regex match doesn't exist, current filename is not an appropriately + # formatted source data file. + if not match: + return False + + # Convert date found in CSV name to int. + code = int(match.groupdict()['date']) + + # Return boolean True if current file date "code" is within the defined date range. + return start_code <= code <= end_code + + return f + + class ValidationError(Exception): """ Error raised when validation check fails. """ def __init__(self, check_data_id, expression, message): """ Arguments: - - check_data_id: str or tuple/list of str uniquely identifying the check that was run and on what data - - expression: relevant variables to message, e.g., if a date doesn't pass a check, provide the date + - check_data_id: str or tuple/list of str uniquely identifying the + check that was run and on what data + - expression: relevant variables to message, e.g., if a date doesn't + pass a check, provide the date - message: str explaining why an error was raised """ self.check_data_id = (check_data_id,) if not isinstance( @@ -50,7 +92,8 @@ def __init__(self, check_data_id, expression, message): class Validator(): - """ Class containing validation() function and supporting functions. Stores a list of all raised errors, and user settings. """ + """ Class containing validation() function and supporting functions. Stores a list + of all raised errors, and user settings. 
""" def __init__(self, params): """ @@ -60,14 +103,19 @@ def __init__(self, params): - params: dictionary of user settings; if empty, defaults will be used Attributes: - - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html + - data_source: str; data source name, one of + https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html - start_date: beginning date of data to check, in datetime date format - end_date: end date of data to check, in datetime date format - - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test - - max_check_lookbehind: number of days back to perform sanity checks, starting from the last date appearing in df_to_test + - generation_date: date that this df_to_test was generated; typically 1 day + after the last date in df_to_test + - max_check_lookbehind: number of days back to perform sanity checks, starting + from the last date appearing in df_to_test - minimum_sample_size: int - - missing_se_allowed: boolean indicating if missing standard errors should raise an exception or not - - missing_sample_size_allowed: boolean indicating if missing sample size should raise an exception or not + - missing_se_allowed: boolean indicating if missing standard errors should + raise an exception or not + - missing_sample_size_allowed: boolean indicating if missing sample size should + raise an exception or not - sanity_check_rows_per_day: boolean; check flag - sanity_check_value_diffs: boolean; check flag - check_vs_working: boolean; check flag @@ -101,43 +149,6 @@ def __init__(self, params): self.raised_errors = [] - def make_date_filter(self, start_date, end_date): - """ - Create a function to return a boolean of whether a filename of appropriate format contains a date within (inclusive) the specified date range. 
- - Arguments: - - start_date: datetime date object - - end_date: datetime date object - - Returns: - - Custom function object - """ - # Convert dates from datetime format to int. - start_code = int(start_date.strftime("%Y%m%d")) - end_code = int(end_date.strftime("%Y%m%d")) - - def f(match): - """ - Return a boolean of whether a filename of appropriate format contains a date within the specified date range. - - Arguments: - - match: regex match object based on filename_regex applied to a filename str - - Returns: - - boolean - """ - # If regex match doesn't exist, current filename is not an appropriately formatted source data file. - if not match: - return False - - # Convert date found in CSV name to int. - code = int(match.groupdict()['date']) - - # Return boolean True if current file date "code" is within the defined date range. - return start_code <= code <= end_code - - return f - def check_missing_dates(self, daily_filenames): """ Check for missing dates between the specified start and end dates. @@ -199,7 +210,8 @@ def check_df_format(self, df_to_test, nameformat): Check basic format of source data CSV df. Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - df_to_test: pandas dataframe of a single CSV of source data + (one day-signal-geo_type combo) - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" Returns: @@ -234,7 +246,8 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ - Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. + Check if any geo_ids in df_to_test aren't formatted correctly, according + to the geo type dictionary negated_regex_dict. 
""" unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall( negated_regex) if len(ugeo) > 0] @@ -290,7 +303,8 @@ def check_bad_se(self, df_to_test, nameformat): Check standard errors for validity. Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - df_to_test: pandas dataframe of a single CSV of source data + (one day-signal-geo_type combo) - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" Returns: @@ -338,7 +352,9 @@ def check_bad_se(self, df_to_test, nameformat): if not result_jeffreys.empty: self.raised_errors.append(ValidationError( ("check_se_0_when_val_0", nameformat), - None, "when signal value is 0, se must be non-zero. please use Jeffreys correction to generate an appropriate se")) + None, + "when signal value is 0, se must be non-zero. please " + + "use Jeffreys correction to generate an appropriate se")) elif not result_alt.empty: self.raised_errors.append(ValidationError( ("check_se_0", nameformat), @@ -349,7 +365,8 @@ def check_bad_sample_size(self, df_to_test, nameformat): Check sample sizes for validity. Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) + - df_to_test: pandas dataframe of a single CSV of source data + (one day-signal-geo_type combo) - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv" Returns: @@ -430,8 +447,10 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date Check if reference data is more recent than test data. 
Arguments: - - df_to_test: pandas dataframe of a single CSV of source data (one day-signal-geo_type combo) - - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - df_to_test: pandas dataframe of a single CSV of source data + (one day-signal-geo_type combo) + - df_to_reference: pandas dataframe of reference data, either from the + COVIDcast API or semirecent data - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - sig: str; signal name as in the CSV name @@ -455,7 +474,8 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si Arguments: - df_to_test: pandas dataframe of CSV source data - - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data + - df_to_reference: pandas dataframe of reference data, either from the + COVIDcast API or semirecent data - checking_date: datetime date - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - sig: str; signal name as in the CSV name @@ -468,28 +488,37 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si reference_rows_per_reporting_day = df_to_reference.shape[0] / len( set(df_to_reference["time_value"])) - if abs(reldiff_by_min(test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: + if abs(reldiff_by_min( + test_rows_per_reporting_day, + reference_rows_per_reporting_day)) > 0.35: self.raised_errors.append(ValidationError( ("check_rapid_change_num_rows", checking_date.date(), geo, sig), (test_rows_per_reporting_day, reference_rows_per_reporting_day), "Number of rows per day (-with-any-rows) seems to have changed " + "rapidly (reference vs test data)")) - def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checking_date, geo, sig): + def check_avg_val_diffs(self, + df_to_test, df_to_reference, + smooth_option, + checking_date, + geo, sig): """ Compare average values for each variable in test 
dataframe vs reference dataframe. Arguments: - df_to_test: pandas dataframe of CSV source data - - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal (e.g. 7dav is "smoothed") + - df_to_reference: pandas dataframe of reference data, either from the + COVIDcast API or semirecent data + - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal + (e.g. 7dav is "smoothed") - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - sig: str; signal name as in the CSV name Returns: - None """ - # Average each of val, se, and sample_size over all dates for a given geo_id. Ignores NA by default. + # Average each of val, se, and sample_size over all dates for a given geo_id. + # Ignores NA by default. df_to_test = df_to_test.groupby(['geo_id'], as_index=False)[ ['val', 'se', 'sample_size']].mean() df_to_test["type"] = "test" @@ -500,7 +529,9 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki df_all = pd.concat([df_to_test, df_to_reference]) - # For each variable (val, se, and sample size) where not missing, calculate the relative mean difference and mean absolute difference between the test data and the reference data across all geographic regions. + # For each variable (val, se, and sample size) where not missing, calculate the + # relative mean difference and mean absolute difference between the test data + # and the reference data across all geographic regions. df_all = pd.melt( df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] ).pivot(index=("geo_id", "variable"), columns="type", values="value" @@ -538,21 +569,21 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, smooth_option, checki # Check if the calculated mean differences are high compared to the thresholds. 
mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ( - (df_all["variable"] == "val").bool() and (abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) + (df_all["variable"] == "val").bool() and ( + abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) mean_stdabsdiff_high = ( df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() - flag = mean_stddiff_high or mean_stdabsdiff_high - - if flag: + if mean_stddiff_high or mean_stdabsdiff_high: self.raised_errors.append(ValidationError( ("check_test_vs_reference_avg_changed", checking_date.date(), geo, sig), (mean_stddiff_high, mean_stdabsdiff_high), - 'Average differences in variables by geo_id between recent & reference data (either semirecent or from API) seem' - + 'large --- either large increase tending toward one direction or large mean absolute' - + 'difference, relative to average values of corresponding variables. For the former' - + 'check, tolerances for `val` are more restrictive than those for other columns.')) + 'Average differences in variables by geo_id between recent & reference data ' + + '(either semirecent or from API) seem large --- either large increase ' + + 'tending toward one direction or large mean absolute difference, relative ' + + 'to average values of corresponding variables. For the former check, ' + + 'tolerances for `val` are more restrictive than those for other columns.')) def validate(self, export_dir): """ @@ -566,14 +597,11 @@ def validate(self, export_dir): """ # Get relevant data file names and info. export_files = read_filenames(export_dir) - date_filter = self.make_date_filter(self.start_date, self.end_date) + date_filter = make_date_filter(self.start_date, self.end_date) - # List of tuples of CSV names and regex match objects. + # Make list of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] - # Get list of just CSV names. 
- filenames = [name_match_pair[0] for name_match_pair in validate_files] - # Get all expected combinations of geo_type and signal. geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) @@ -620,10 +648,19 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') + # TODO: Remove for actual version + kroc = 0 + + # TODO: Improve efficiency by grouping all_frames by geo and sig instead + # of reading data in again via read_geo_sig_cmbo_files(). + # Comparison checks - # TODO: Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. - for geo_sig_df, geo, sig in read_geo_sig_cmbo_files(geo_sig_cmbo, export_dir, filenames, date_slist): + for geo_sig_df, geo, sig in read_geo_sig_cmbo_files( + geo_sig_cmbo, + export_dir, + [name_match_pair[0] for name_match_pair in validate_files], + date_slist): m = smooth_option_regex.match(sig) smooth_option = m.group(1) @@ -640,11 +677,13 @@ def validate(self, export_dir): self.check_min_allowed_max_date(max_date, weight_option, geo, sig) self.check_max_allowed_max_date(max_date, geo, sig) - # TODO: Check to see, if this date is in the API, if values have been updated and changed significantly. + # TODO: Check to see, if this date is in the API, if values have been updated + # and changed significantly. # TODO: Compare data against long-ago (3 months?) API data for changes in trends. - # Check data from a group of dates against recent (previous 7 days, by default) data from the API. + # Check data from a group of dates against recent (previous 7 days, by default) + # data from the API. 
for checking_date in date_list: recent_cutoff_date = checking_date - recent_lookbehind recent_df = geo_sig_df.query( @@ -668,6 +707,11 @@ def validate(self, export_dir): self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) + # TODO: Remove for actual version + kroc += 1 + if kroc == 2: + break + self.exit() def exit(self): @@ -678,9 +722,9 @@ def exit(self): suppressed_counter = 0 subset_raised_errors = [] - for val_error in set(self.raised_errors): + for val_error in self.raised_errors: raised_check_id = tuple(item.strftime("%Y-%m-%d") if isinstance( - item, date) or isinstance(item, datetime) else item for item in val_error.check_data_id) + item, (date, datetime)) else item for item in val_error.check_data_id) if raised_check_id not in self.suppressed_errors: subset_raised_errors.append(val_error) From bfb3408643263c27ebdbe60cb963c5e804f176d4 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Oct 2020 14:28:31 -0400 Subject: [PATCH 068/151] README, PLANS, tests updates. Moved ValidationError to errors.py --- validator/PLANS.md | 12 +- validator/README.md | 14 +- validator/delphi_validator/errors.py | 22 +++- validator/delphi_validator/validate.py | 19 +-- validator/tests/test_checks.py | 176 ++++++++++++++++++++++++- 5 files changed, 208 insertions(+), 35 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 4a548a4ad..d871de82b 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -2,6 +2,7 @@ ## Current checks for indicator source data +* Missing dates within the selected range * Appropriate file name * Recognized geographical type (county, state, etc) * Recognized geo id format (e.g. 
state is two lowercase letters) @@ -15,22 +16,21 @@ * If signal and stderr both = 0 (seen in Quidel data due to lack of Jeffreys correction, [issue 255](https://github.com/cmu-delphi/covidcast-indicators/issues/255#issuecomment-692196541)) * Missing ‘sample_size’ values * Appropriate ‘sample_size’ values, ≥ 100 (default) or user-defined threshold -* Similar number of obs per day as recent API data -* Similar average values as API data -* Missing dates within the selected range +* Similar number of obs per day as recent API data (static threshold) +* Similar average values as API data (static threshold) ## Current features -* Errors are summarized in class attribute +* Errors are summarized in class attribute and printed on exit * Various check settings are controllable via indicator-specific params.json files +* User can manually disable certain checks for certain sets of data using a field in the params.json file ## Checks + features wishlist, and problems to think about: * check for large jumps * Which, if any, specific geo_ids are missing * different thresholds for different files? -* flags to disable certain checks -* re-adjust thresholds to not pass for some previous days that did have issues +* use known erroneous/anomalous days of source data to re-adjust thresholds to not pass * check number of observations * tests * check for duplicate rows diff --git a/validator/README.md b/validator/README.md index 9d827d836..d12a0f36d 100644 --- a/validator/README.md +++ b/validator/README.md @@ -83,13 +83,19 @@ The output will show the number of unit tests that passed and failed, along with ## Code tour * run.py: sends params.json fields to and runs the validation process -* datafetcher.py: methods for loading source data -* validate.py: methods for validating source data. Includes the individual check functions. -* errors.py: custom validation errors +* datafetcher.py: methods for loading source and API data +* validate.py: methods for validating data. 
Includes the individual check methods and supporting functions. +* errors.py: custom errors ## Adding checks To add a new validation check, define the check as a `Validator` class method in `validate.py`. Each check should append a descriptive error message to the `raised` attribute if triggered. All checks should allow the user to override exception raising for a specific file using the `exception_override` setting in `params.json`. -Add the newly defined check to the `validate()` method to be executed. It should go in one of two sections: data sanity checks where a data file is compared against static format settings, or data trend and value checks where a set of data is compared against API data. \ No newline at end of file +This features requires that the `check_data_id` defined for an error uniquely identifies that combination of check and test data. This usually takes the form of a tuple of strings with the check method and test identifier, and test data filename or date, geo type, and signal name. + +Add the newly defined check to the `validate()` method to be executed. It should go in one of three sections: + +* data sanity checks where a data file is compared against static format settings, +* data trend and value checks where a set of data is compared against recent API data, from the previous few days, +* data trend and value checks where a set of data is compared against long term API data, from a few months ago \ No newline at end of file diff --git a/validator/delphi_validator/errors.py b/validator/delphi_validator/errors.py index 7ed08db36..b5ed9e69f 100644 --- a/validator/delphi_validator/errors.py +++ b/validator/delphi_validator/errors.py @@ -3,8 +3,9 @@ Custom validator exceptions. """ + class APIDataFetchError(Exception): - """Exception raised for errors during validation. + """Exception raised when reading API data goes wrong. 
Attributes: custom_msg -- parameters which caused the error @@ -16,3 +17,22 @@ def __init__(self, custom_msg): def __str__(self): return '{}'.format(self.custom_msg) + + +class ValidationError(Exception): + """ Error raised when validation check fails. """ + + def __init__(self, check_data_id, expression, message): + """ + Arguments: + - check_data_id: str or tuple/list of str uniquely identifying the + check that was run and on what data + - expression: relevant variables to message, e.g., if a date doesn't + pass a check, provide the date + - message: str explaining why an error was raised + """ + self.check_data_id = (check_data_id,) if not isinstance( + check_data_id, tuple) and not isinstance(check_data_id, list) else tuple(check_data_id) + self.expression = expression + self.message = message + super().__init__(self.message) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 6eb63b102..a91d2430d 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -10,6 +10,7 @@ from datetime import date, datetime, timedelta import pandas as pd +from .errors import ValidationError from .datafetcher import filename_regex, \ read_filenames, load_csv, get_geo_sig_cmbo, \ read_geo_sig_cmbo_files, fetch_api_reference @@ -73,24 +74,6 @@ def f(match): return f -class ValidationError(Exception): - """ Error raised when validation check fails. 
""" - - def __init__(self, check_data_id, expression, message): - """ - Arguments: - - check_data_id: str or tuple/list of str uniquely identifying the - check that was run and on what data - - expression: relevant variables to message, e.g., if a date doesn't - pass a check, provide the date - - message: str explaining why an error was raised - """ - self.check_data_id = (check_data_id,) if not isinstance( - check_data_id, tuple) and not isinstance(check_data_id, list) else tuple(check_data_id) - self.expression = expression - self.message = message - - class Validator(): """ Class containing validation() function and supporting functions. Stores a list of all raised errors, and user settings. """ diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 932cacaf9..941e001ea 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -1,18 +1,182 @@ import pytest +from datetime import date, datetime, timedelta import pandas as pd +from delphi_validator.datafetcher import filename_regex +import delphi_validator.validate from delphi_validator.validate import Validator + # # Define constants. 
# PARAMS = read_params() # DATA_FILEPATH = PARAMS["input_file"] -class TestCheckBadVal: - validator = Validator() +class TestDateFilter: + + def test_same_day_filter(self): + start_date = end_date = datetime.strptime("20200902", "%Y%m%d") + date_filter = delphi_validator.validate.make_date_filter( + start_date, end_date) + + filenames = [(f, filename_regex.match(f)) + for f in ("20200901_county_signal_signal.csv", + "20200902_county_signal_signal.csv", + "20200903_county_signal_signal.csv")] + + subset_filenames = [(f, m) for (f, m) in filenames if date_filter(m)] + + assert len(subset_filenames) == 1 + assert subset_filenames[0] == "20200902_county_signal_signal.csv" + + +# class TestValidatorInitialization: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckMissingDates: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckSettings: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckDfFormat: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckBadGeoId: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckBadVal: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert 
len(self.validator.raised) == 0 + + +# class TestCheckBadSe: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckBadN: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckMinDate: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckMaxDate: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckMaxReferenceDate: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckRapidChange: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# class TestCheckAvgValDiffs: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# # How? +# class TestValidate: + +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") + +# assert len(self.validator.raised) == 0 + + +# # How? 
+# class TestExit: - def test_empty_df(self): - empty_df = pd.DataFrame(columns=["val"]) - self.validator.check_bad_val(empty_df, "") +# def test_empty_df(self): +# validator = Validator() +# empty_df = pd.DataFrame(columns=["val"]) +# self.validator.check_bad_val(empty_df, "") - assert len(self.validator.raised) == 0 +# assert len(self.validator.raised) == 0 From d2162c6afd1653eb11f929ba94ead3da89110753 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Oct 2020 22:03:41 -0400 Subject: [PATCH 069/151] unit tests --- validator/delphi_validator/validate.py | 8 +- validator/tests/test_checks.py | 166 +++++++++++++++++-------- 2 files changed, 118 insertions(+), 56 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index a91d2430d..c8135d759 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -101,7 +101,6 @@ def __init__(self, params): raise an exception or not - sanity_check_rows_per_day: boolean; check flag - sanity_check_value_diffs: boolean; check flag - - check_vs_working: boolean; check flag - suppressed_errors: set of check_data_ids used to identify error messages to ignore - raised_errors: list to append errors to as they are raised """ @@ -111,7 +110,7 @@ def __init__(self, params): datetime.strptime(params['start_date'], '%Y-%m-%d')) self.end_date = datetime.date( datetime.strptime(params['end_date'], '%Y-%m-%d')) - self.generation_date = params.get('generation_date', date.today()) + self.generation_date = date.today() self.max_check_lookbehind = timedelta( days=params.get("ref_window_size", 7)) @@ -124,8 +123,6 @@ def __init__(self, params): 'sanity_check_rows_per_day', True) self.sanity_check_value_diffs = params.get( 'sanity_check_value_diffs', True) - # TODO: use for something... 
See https://github.com/cmu-delphi/covid-19/blob/fb-survey/facebook/prepare-extracts/covidalert-io-funs.R#L439 - self.check_vs_working = params.get('check_vs_working', True) self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( item, list) else tuple(item) for item in params.get('suppressed_errors', [])} @@ -137,7 +134,8 @@ def check_missing_dates(self, daily_filenames): Check for missing dates between the specified start and end dates. Arguments: - - daily_filenames: list of CSV source data filenames. + - daily_filenames: list of tuples, each containing CSV source data filename + and the regex match object corresponding to filename_regex. Returns: - None diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 941e001ea..ff4299edf 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -3,8 +3,9 @@ import pandas as pd from delphi_validator.datafetcher import filename_regex -import delphi_validator.validate -from delphi_validator.validate import Validator +from delphi_validator.validate import Validator, make_date_filter + +import pdb # # Define constants. 
@@ -14,9 +15,9 @@ class TestDateFilter: - def test_same_day_filter(self): + def test_same_day(self): start_date = end_date = datetime.strptime("20200902", "%Y%m%d") - date_filter = delphi_validator.validate.make_date_filter( + date_filter = make_date_filter( start_date, end_date) filenames = [(f, filename_regex.match(f)) @@ -27,156 +28,219 @@ def test_same_day_filter(self): subset_filenames = [(f, m) for (f, m) in filenames if date_filter(m)] assert len(subset_filenames) == 1 - assert subset_filenames[0] == "20200902_county_signal_signal.csv" + assert subset_filenames[0][0] == "20200902_county_signal_signal.csv" + def test_inclusive(self): + start_date = datetime.strptime("20200902", "%Y%m%d") + end_date = datetime.strptime("20200903", "%Y%m%d") + date_filter = make_date_filter( + start_date, end_date) -# class TestValidatorInitialization: + filenames = [(f, filename_regex.match(f)) + for f in ("20200901_county_signal_signal.csv", + "20200902_county_signal_signal.csv", + "20200903_county_signal_signal.csv", + "20200904_county_signal_signal.csv")] -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + subset_filenames = [(f, m) for (f, m) in filenames if date_filter(m)] -# assert len(self.validator.raised) == 0 + assert len(subset_filenames) == 2 + def test_empty(self): + start_date = datetime.strptime("20200902", "%Y%m%d") + end_date = datetime.strptime("20200903", "%Y%m%d") + date_filter = make_date_filter( + start_date, end_date) -# class TestCheckMissingDates: + filenames = [(f, filename_regex.match(f)) + for f in ()] -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + subset_filenames = [(f, m) for (f, m) in filenames if date_filter(m)] -# assert len(self.validator.raised) == 0 + assert len(subset_filenames) == 0 -# class TestCheckSettings: +class TestValidatorInitialization: 
-# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_default_settings(self): + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-01"} + validator = Validator(params) -# assert len(self.validator.raised) == 0 + assert validator.max_check_lookbehind == timedelta(days=7) + assert validator.minimum_sample_size == 100 + assert validator.missing_se_allowed == False + assert validator.missing_sample_size_allowed == False + assert validator.sanity_check_rows_per_day == True + assert validator.sanity_check_value_diffs == True + assert len(validator.suppressed_errors) == 0 + assert isinstance(validator.suppressed_errors, set) + assert len(validator.raised_errors) == 0 -# class TestCheckDfFormat: +class TestCheckMissingDates: -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_empty_filelist(self): + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-09"} + validator = Validator(params) + + filenames = list() + validator.check_missing_dates(filenames) + + assert len(validator.raised_errors) == 1 + assert "check_missing_date_files" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert len(validator.raised_errors[0].expression) == 9 + + def test_same_day(self): + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-01"} + validator = Validator(params) + + filenames = [("20200901_county_signal_signal.csv", "match_obj")] + validator.check_missing_dates(filenames) + + assert len(validator.raised_errors) == 0 + assert "check_missing_date_files" not in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_duplicate_dates(self): + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} + validator = Validator(params) + + 
filenames = [("20200901_county_signal_signal.csv", "match_obj"), + ("20200903_county_signal_signal.csv", "match_obj"), + ("20200903_usa_signal_signal.csv", "match_obj"), + ("20200903_usa_signal_signal.csv", "match_obj")] + validator.check_missing_dates(filenames) + + assert len(validator.raised_errors) == 1 + assert "check_missing_date_files" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert len([err.expression[0] for + err in validator.raised_errors if err.check_data_id[0] == + "check_missing_date_files"]) == 1 + assert [err.expression[0] for + err in validator.raised_errors if err.check_data_id[0] == + "check_missing_date_files"][0] == datetime.strptime("20200902", "%Y%m%d").date() + + +class TestNameFormat: -# assert len(self.validator.raised) == 0 + def test_empty_df(self): + pattern_found = filename_regex.match("20200903_usa_signal_signal.csv") + pdb.set_trace() # class TestCheckBadGeoId: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckBadVal: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckBadSe: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckBadN: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# 
assert len(self.validator.raised_errors) == 0 # class TestCheckMinDate: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckMaxDate: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckMaxReferenceDate: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckRapidChange: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # class TestCheckAvgValDiffs: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # # How? # class TestValidate: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 # # How? 
# class TestExit: # def test_empty_df(self): -# validator = Validator() +# validator = Validator() # empty_df = pd.DataFrame(columns=["val"]) # self.validator.check_bad_val(empty_df, "") -# assert len(self.validator.raised) == 0 +# assert len(self.validator.raised_errors) == 0 From 93ff163c2d841901798721433e1aae119a9ab1b3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 13:05:09 -0400 Subject: [PATCH 070/151] [wip] more tests. regex for catching wrong geo_ids is not sensitive enough --- validator/delphi_validator/validate.py | 17 ++-- validator/tests/test_checks.py | 121 +++++++++++++++++++++---- 2 files changed, 111 insertions(+), 27 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index c8135d759..b8797aa24 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -220,11 +220,6 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): Returns: - None """ - if geo_type not in negated_regex_dict: - self.raised_errors.append(ValidationError( - ("check_geo_type", nameformat), - geo_type, "Unrecognized geo type")) - def find_all_unexpected_geo_ids(df_to_test, negated_regex): """ Check if any geo_ids in df_to_test aren't formatted correctly, according @@ -235,9 +230,15 @@ def find_all_unexpected_geo_ids(df_to_test, negated_regex): if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( ("check_geo_id_format", nameformat), - unexpected_geos, "Non-conforming geo_ids found")) + set(unexpected_geos), "Non-conforming geo_ids found")) - find_all_unexpected_geo_ids(df_to_test, negated_regex_dict[geo_type]) + if geo_type not in negated_regex_dict: + self.raised_errors.append(ValidationError( + ("check_geo_type", nameformat), + geo_type, "Unrecognized geo type")) + else: + find_all_unexpected_geo_ids( + df_to_test, negated_regex_dict[geo_type]) def check_bad_val(self, df_to_test, nameformat, 
signal_type): """ @@ -252,7 +253,7 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): """ # Determine if signal is a proportion or percent percent_option = bool('pct' in signal_type) - proportion_option = bool('pct' in signal_type) + proportion_option = bool('prop' in signal_type) if percent_option: if not df_to_test[(df_to_test['val'] > 100)].empty: diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index ff4299edf..b25ee0b1b 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -8,11 +8,6 @@ import pdb -# # Define constants. -# PARAMS = read_params() -# DATA_FILEPATH = PARAMS["input_file"] - - class TestDateFilter: def test_same_day(self): @@ -129,29 +124,117 @@ def test_duplicate_dates(self): class TestNameFormat: - def test_empty_df(self): + def test_match_existence(self): pattern_found = filename_regex.match("20200903_usa_signal_signal.csv") - pdb.set_trace() + assert pattern_found + pattern_found = filename_regex.match("2020090_usa_signal_signal.csv") + assert not pattern_found -# class TestCheckBadGeoId: + pattern_found = filename_regex.match("20200903_usa_signal_signal.pdf") + assert not pattern_found -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + pattern_found = filename_regex.match("20200903_usa_.csv") + assert not pattern_found -# assert len(self.validator.raised_errors) == 0 + def test_expected_groups(self): + pattern_found = filename_regex.match( + "20200903_usa_signal_signal.csv").groupdict() + assert pattern_found["date"] == "20200903" + assert pattern_found["geo_type"] == "usa" + assert pattern_found["signal"] == "signal_signal" -# class TestCheckBadVal: +class TestCheckBadGeoId: + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# 
self.validator.check_bad_val(empty_df, "") + def test_empty_df(self): + validator = Validator(self.params) + empty_df = pd.DataFrame(columns=["geo_id"]) + validator.check_bad_geo_id(empty_df, "name", "county") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + + def test_invalid_geo_type(self): + validator = Validator(self.params) + empty_df = pd.DataFrame(columns=["geo_id"]) + validator.check_bad_geo_id(empty_df, "name", "hello") + + assert len(validator.raised_errors) == 1 + assert "check_geo_type" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert [err.expression for + err in validator.raised_errors if err.check_data_id[0] == + "check_geo_type"][0] == "hello" + + def test_invalid_geo_id_county(self): + validator = Validator(self.params) + df = pd.DataFrame(["0", "54321", "123", ".0000", + "abc12"], columns=["geo_id"]) + validator.check_bad_geo_id(df, "name", "county") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_format" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 4 + assert "54321" not in validator.raised_errors[0].expression + + def test_invalid_geo_id_msa(self): + validator = Validator(self.params) + df = pd.DataFrame(["0", "54321", "123", ".0000", + "abc12"], columns=["geo_id"]) + validator.check_bad_geo_id(df, "name", "msa") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_format" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 4 + assert "54321" not in validator.raised_errors[0].expression + + def test_invalid_geo_id_hrr(self): + validator = Validator(self.params) + df = pd.DataFrame(["1", "12", "123", "1234", "12345", + "a", ".", "ab1"], columns=["geo_id"]) + validator.check_bad_geo_id(df, "name", "hrr") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_format" in validator.raised_errors[0].check_data_id + assert 
len(validator.raised_errors[0].expression) == 5 + assert "1" not in validator.raised_errors[0].expression + assert "12" not in validator.raised_errors[0].expression + assert "123" not in validator.raised_errors[0].expression + + def test_invalid_geo_id_state(self): + validator = Validator(self.params) + df = pd.DataFrame(["aa", "hi", "HI", "hawaii", + "Hawaii", "a", "H.I."], columns=["geo_id"]) + validator.check_bad_geo_id(df, "name", "state") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_format" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 5 + assert "aa" not in validator.raised_errors[0].expression + assert "hi" not in validator.raised_errors[0].expression + + def test_invalid_geo_id_national(self): + validator = Validator(self.params) + df = pd.DataFrame(["usa", "USA", " usa", "us", + "usausa", "America"], columns=["geo_id"]) + validator.check_bad_geo_id(df, "name", "national") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_format" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 5 + assert "usa" not in validator.raised_errors[0].expression + + +class TestCheckBadVal: + + def test_empty_df(self): + validator = Validator() + empty_df = pd.DataFrame(columns=["val"]) + self.validator.check_bad_val(empty_df, "") + + assert len(self.validator.raised_errors) == 0 # class TestCheckBadSe: From 9a0e8dc7d7edba21be02c5bcc82a4a5df9526811 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 15:40:33 -0400 Subject: [PATCH 071/151] sample size and se tests --- validator/delphi_validator/validate.py | 13 +- validator/tests/test_checks.py | 194 ++++++++++++++++++++----- 2 files changed, 164 insertions(+), 43 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index b8797aa24..a86e4159b 100644 --- 
a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -251,7 +251,7 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): Returns: - None """ - # Determine if signal is a proportion or percent + # Determine if signal is a proportion (# of x out of 100k people) or percent percent_option = bool('pct' in signal_type) proportion_option = bool('prop' in signal_type) @@ -267,7 +267,7 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): self.raised_errors.append(ValidationError( ("check_val_prop_gt_100k", nameformat), df_to_test[(df_to_test['val'] > 100000)], - "val column can't have any cell greater than 100000 for nameformat")) + "val column can't have any cell greater than 100000 for proportions")) if df_to_test['val'].isnull().values.any(): self.raised_errors.append(ValidationError( @@ -300,19 +300,14 @@ def check_bad_se(self, df_to_test, nameformat): df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3) if not self.missing_se_allowed: - if df_to_test['se'].isnull().values.any(): - self.raised_errors.append(ValidationError( - ("check_se_missing", nameformat), - None, "se must not be NA")) - # Find rows not in the allowed range for se. 
result = df_to_test.query( '~((se > 0) & (se < 50) & (se <= se_upper_limit))') if not result.empty: self.raised_errors.append(ValidationError( - ("check_se_in_range", nameformat), - result, "se must be in (0, min(50,val*(1+eps))]")) + ("check_se_not_missing_and_in_range", nameformat), + result, "se must be in (0, min(50,val*(1+eps))] and not missing")) if df_to_test["se"].isnull().mean() > 0.5: self.raised_errors.append(ValidationError( diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index b25ee0b1b..68687f27b 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -1,5 +1,6 @@ import pytest from datetime import date, datetime, timedelta +import numpy as np import pandas as pd from delphi_validator.datafetcher import filename_regex @@ -228,56 +229,183 @@ def test_invalid_geo_id_national(self): class TestCheckBadVal: + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} def test_empty_df(self): - validator = Validator() + validator = Validator(self.params) empty_df = pd.DataFrame(columns=["val"]) - self.validator.check_bad_val(empty_df, "") + validator.check_bad_val(empty_df, "", "") + validator.check_bad_val(empty_df, "", "prop") + validator.check_bad_val(empty_df, "", "pct") - assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + def test_missing(self): + validator = Validator(self.params) + df = pd.DataFrame([np.nan], columns=["val"]) + validator.check_bad_val(df, "name", "signal") -# class TestCheckBadSe: + assert len(validator.raised_errors) == 1 + assert "check_val_missing" in validator.raised_errors[0].check_data_id -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_lt_0(self): + validator = Validator(self.params) + df = pd.DataFrame([-5], columns=["val"]) + validator.check_bad_val(df, "name", "signal") -# assert 
len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 1 + assert "check_val_lt_0" in validator.raised_errors[0].check_data_id + def test_gt_max_pct(self): + validator = Validator(self.params) + df = pd.DataFrame([1e7], columns=["val"]) + validator.check_bad_val(df, "name", "pct") -# class TestCheckBadN: + assert len(validator.raised_errors) == 1 + assert "check_val_pct_gt_100" in validator.raised_errors[0].check_data_id -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_gt_max_prop(self): + validator = Validator(self.params) + df = pd.DataFrame([1e7], columns=["val"]) + validator.check_bad_val(df, "name", "prop") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 1 + assert "check_val_prop_gt_100k" in validator.raised_errors[0].check_data_id -# class TestCheckMinDate: +class TestCheckBadSe: + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_empty_df(self): + validator = Validator(self.params) + empty_df = pd.DataFrame( + columns=["val", "se", "sample_size"], dtype=float) + validator.check_bad_se(empty_df, "") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + validator.missing_se_allowed = True + validator.check_bad_se(empty_df, "") -# class TestCheckMaxDate: + assert len(validator.raised_errors) == 0 -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_missing(self): + validator = Validator(self.params) + validator.missing_se_allowed = True + df = pd.DataFrame([[np.nan, np.nan, np.nan]], columns=[ + "val", "se", "sample_size"]) + validator.check_bad_se(df, "name") 
-# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + validator.missing_se_allowed = False + validator.check_bad_se(df, "name") -# class TestCheckMaxReferenceDate: + assert len(validator.raised_errors) == 2 + assert "check_se_not_missing_and_in_range" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert "check_se_many_missing" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_e_0_missing_allowed(self): + validator = Validator(self.params) + validator.missing_se_allowed = True + df = pd.DataFrame([[1, 0, 200], [1, np.nan, np.nan], [ + 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) + validator.check_bad_se(df, "name") + + assert len(validator.raised_errors) == 2 + assert "check_se_missing_or_in_range" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert "check_se_0" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_e_0_missing_not_allowed(self): + validator = Validator(self.params) + validator.missing_se_allowed = False + df = pd.DataFrame([[1, 0, 200], [1, 0, np.nan], [ + 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) + validator.check_bad_se(df, "name") + + assert len(validator.raised_errors) == 2 + assert "check_se_not_missing_and_in_range" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert "check_se_0" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_jeffreys(self): + validator = Validator(self.params) + validator.missing_se_allowed = False + df = pd.DataFrame([[0, 0, 200], [1, 0, np.nan], [ + 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) + validator.check_bad_se(df, "name") + + assert len(validator.raised_errors) == 2 + assert "check_se_not_missing_and_in_range" in [ + err.check_data_id[0] for err in validator.raised_errors] + assert "check_se_0_when_val_0" in [ + err.check_data_id[0] for err in validator.raised_errors] + + +class TestCheckBadN: 
+ params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} + + def test_empty_df(self): + validator = Validator(self.params) + empty_df = pd.DataFrame( + columns=["val", "se", "sample_size"], dtype=float) + validator.check_bad_sample_size(empty_df, "") + + assert len(validator.raised_errors) == 0 + + validator.missing_sample_size_allowed = True + validator.check_bad_sample_size(empty_df, "") + + assert len(validator.raised_errors) == 0 + + def test_missing(self): + validator = Validator(self.params) + validator.missing_sample_size_allowed = True + df = pd.DataFrame([[np.nan, np.nan, np.nan]], columns=[ + "val", "se", "sample_size"]) + validator.check_bad_sample_size(df, "name") + + assert len(validator.raised_errors) == 0 + + validator.missing_sample_size_allowed = False + validator.check_bad_sample_size(df, "name") + + assert len(validator.raised_errors) == 1 + assert "check_n_missing" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_lt_min_missing_allowed(self): + validator = Validator(self.params) + validator.missing_sample_size_allowed = True + df = pd.DataFrame([[1, 0, 10], [1, np.nan, np.nan], [ + 1, np.nan, np.nan]], columns=["val", "se", "sample_size"]) + validator.check_bad_sample_size(df, "name") + + assert len(validator.raised_errors) == 1 + assert "check_n_missing_or_gt_min" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_lt_min_missing_not_allowed(self): + validator = Validator(self.params) + validator.missing_sample_size_allowed = False + df = pd.DataFrame([[1, 0, 10], [1, np.nan, 240], [ + 1, np.nan, 245]], columns=["val", "se", "sample_size"]) + validator.check_bad_sample_size(df, "name") + + assert len(validator.raised_errors) == 1 + assert "check_n_gt_min" in [ + err.check_data_id[0] for err in validator.raised_errors] + + +# class TestCheckMinDate: # def test_empty_df(self): # validator = Validator() @@ -287,7 +415,7 @@ def test_empty_df(self): # assert 
len(self.validator.raised_errors) == 0 -# class TestCheckRapidChange: +# class TestCheckMaxDate: # def test_empty_df(self): # validator = Validator() @@ -297,7 +425,7 @@ def test_empty_df(self): # assert len(self.validator.raised_errors) == 0 -# class TestCheckAvgValDiffs: +# class TestCheckMaxReferenceDate: # def test_empty_df(self): # validator = Validator() @@ -307,8 +435,7 @@ def test_empty_df(self): # assert len(self.validator.raised_errors) == 0 -# # How? -# class TestValidate: +# class TestCheckRapidChange: # def test_empty_df(self): # validator = Validator() @@ -318,8 +445,7 @@ def test_empty_df(self): # assert len(self.validator.raised_errors) == 0 -# # How? -# class TestExit: +# class TestCheckAvgValDiffs: # def test_empty_df(self): # validator = Validator() From 040a98e1e972713acd28f42528c12b56900500e2 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 17:07:02 -0400 Subject: [PATCH 072/151] rapid change and avg value comparison tests --- validator/delphi_validator/validate.py | 4 +- validator/tests/test_checks.py | 134 +++++++++++++++++++------ 2 files changed, 106 insertions(+), 32 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index a86e4159b..f14be57c0 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -469,7 +469,7 @@ def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, si test_rows_per_reporting_day, reference_rows_per_reporting_day)) > 0.35: self.raised_errors.append(ValidationError( - ("check_rapid_change_num_rows", checking_date.date(), geo, sig), + ("check_rapid_change_num_rows", checking_date, geo, sig), (test_rows_per_reporting_day, reference_rows_per_reporting_day), "Number of rows per day (-with-any-rows) seems to have changed " + "rapidly (reference vs test data)")) @@ -532,7 +532,7 @@ def check_avg_val_diffs(self, # Set thresholds for raw and 
smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] - raw_thresholds = pd.DataFrame([0.50, 0.30, 0.80], classes).T + raw_thresholds = pd.DataFrame([1.50, 1.30, 1.80], classes).T smoothed_thresholds = raw_thresholds.apply( lambda x: x/(math.sqrt(7) * 1.5)) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 68687f27b..759008333 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -405,51 +405,125 @@ def test_lt_min_missing_not_allowed(self): err.check_data_id[0] for err in validator.raised_errors] -# class TestCheckMinDate: +class TestCheckRapidChange: + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_same_df(self): + validator = Validator(self.params) + test_df = pd.DataFrame([date.today()] * 5, columns=["time_value"]) + ref_df = pd.DataFrame([date.today()] * 5, columns=["time_value"]) + validator.check_rapid_change( + test_df, ref_df, date.today(), "geo", "signal") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + def test_0_vs_many(self): + validator = Validator(self.params) -# class TestCheckMaxDate: + time_value = datetime.combine(date.today(), datetime.min.time()) -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + test_df = pd.DataFrame([time_value] * 5, columns=["time_value"]) + ref_df = pd.DataFrame([time_value] * 1, columns=["time_value"]) + validator.check_rapid_change( + test_df, ref_df, time_value, "geo", "signal") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 1 + assert "check_rapid_change_num_rows" in [ + err.check_data_id[0] for err in validator.raised_errors] -# class 
TestCheckMaxReferenceDate: +class TestCheckAvgValDiffs: + params = {"data_source": "", "start_date": "2020-09-01", + "end_date": "2020-09-02"} -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + def test_same_val(self): + validator = Validator(self.params) -# assert len(self.validator.raised_errors) == 0 + data = {"val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + test_df = pd.DataFrame(data) + ref_df = pd.DataFrame(data) -# class TestCheckRapidChange: + validator.check_avg_val_diffs( + test_df, ref_df, "raw", date.today(), "geo", "signal") -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + assert len(validator.raised_errors) == 0 + + def test_same_se(self): + validator = Validator(self.params) + + data = {"val": [np.nan] * 6, "se": [1, 1, 1, 2, 0, 1], + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + + test_df = pd.DataFrame(data) + ref_df = pd.DataFrame(data) -# assert len(self.validator.raised_errors) == 0 + validator.check_avg_val_diffs( + test_df, ref_df, "raw", date.today(), "geo", "signal") + + assert len(validator.raised_errors) == 0 + + def test_same_n(self): + validator = Validator(self.params) + data = {"val": [np.nan] * 6, "se": [np.nan] * 6, + "sample_size": [1, 1, 1, 2, 0, 1], "geo_id": ["1"] * 6} -# class TestCheckAvgValDiffs: + test_df = pd.DataFrame(data) + ref_df = pd.DataFrame(data) -# def test_empty_df(self): -# validator = Validator() -# empty_df = pd.DataFrame(columns=["val"]) -# self.validator.check_bad_val(empty_df, "") + validator.check_avg_val_diffs( + test_df, ref_df, "raw", date.today(), "geo", "signal") -# assert len(self.validator.raised_errors) == 0 + assert len(validator.raised_errors) == 0 + + def test_10x_val(self): + validator = Validator(self.params) + test_data = {"val": [1, 1, 1, 20, 0, 
1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + ref_data = {"val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + + test_df = pd.DataFrame(test_data) + ref_df = pd.DataFrame(ref_data) + validator.check_avg_val_diffs( + test_df, ref_df, "raw", + datetime.combine(date.today(), datetime.min.time()), "geo", "signal") + + assert len(validator.raised_errors) == 0 + + def test_100x_val(self): + validator = Validator(self.params) + test_data = {"val": [1, 1, 1, 200, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + ref_data = {"val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + + test_df = pd.DataFrame(test_data) + ref_df = pd.DataFrame(ref_data) + validator.check_avg_val_diffs( + test_df, ref_df, "raw", + datetime.combine(date.today(), datetime.min.time()), "geo", "signal") + + assert len(validator.raised_errors) == 1 + assert "check_test_vs_reference_avg_changed" in [ + err.check_data_id[0] for err in validator.raised_errors] + + def test_1000x_val(self): + validator = Validator(self.params) + test_data = {"val": [1, 1, 1, 2000, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + ref_data = {"val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6, + "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6} + + test_df = pd.DataFrame(test_data) + ref_df = pd.DataFrame(ref_data) + validator.check_avg_val_diffs( + test_df, ref_df, "raw", + datetime.combine(date.today(), datetime.min.time()), "geo", "signal") + + assert len(validator.raised_errors) == 1 + assert "check_test_vs_reference_avg_changed" in [ + err.check_data_id[0] for err in validator.raised_errors] From ab3586e2f6f3eca93109eafd568007ed95cd6764 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 17:53:27 -0400 Subject: [PATCH 073/151] Fixed geo_id regex problems --- 
validator/delphi_validator/validate.py | 30 +++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index f14be57c0..5423d811a 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -14,15 +14,15 @@ from .datafetcher import filename_regex, \ read_filenames, load_csv, get_geo_sig_cmbo, \ read_geo_sig_cmbo_files, fetch_api_reference - +import pdb # Recognized geo types. -negated_regex_dict = { - 'county': '^(?!\d{5}).*$', - 'hrr': '^(?!\d{1,3}).*$', - 'msa': '^(?!\d{5}).*$', - 'state': '^(?![a-z]{2}).*$', - 'national': '(?!usa).*$' +geo_regex_dict = { + 'county': '^\d{5}$', + 'hrr': '^\d{1,3}$', + 'msa': '^\d{5}$', + 'state': '^[a-z]{2}$', + 'national': '^usa$' } @@ -220,25 +220,29 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): Returns: - None """ - def find_all_unexpected_geo_ids(df_to_test, negated_regex): + def find_all_unexpected_geo_ids(df_to_test, geo_regex): """ Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. 
""" - unexpected_geos = [ugeo[0] for ugeo in df_to_test['geo_id'].str.findall( - negated_regex) if len(ugeo) > 0] + expected_geos = [geo[0] for geo in df_to_test['geo_id'].str.findall( + geo_regex) if len(geo) > 0] + + unexpected_geos = {geo for geo in set( + df_to_test['geo_id']) if geo not in expected_geos} + if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( ("check_geo_id_format", nameformat), - set(unexpected_geos), "Non-conforming geo_ids found")) + unexpected_geos, "Non-conforming geo_ids found")) - if geo_type not in negated_regex_dict: + if geo_type not in geo_regex_dict: self.raised_errors.append(ValidationError( ("check_geo_type", nameformat), geo_type, "Unrecognized geo type")) else: find_all_unexpected_geo_ids( - df_to_test, negated_regex_dict[geo_type]) + df_to_test, geo_regex_dict[geo_type]) def check_bad_val(self, df_to_test, nameformat, signal_type): """ From 9c2f17dc482ccae7499c27472ff7475da0e59f1f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 18:11:43 -0400 Subject: [PATCH 074/151] Finished tests. Linting updates. Moved TODOs to plans.md document --- validator/PLANS.md | 7 ++-- validator/delphi_validator/datafetcher.py | 17 ++++---- validator/delphi_validator/validate.py | 47 ++++++++--------------- 3 files changed, 27 insertions(+), 44 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index d871de82b..686af661d 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -27,15 +27,16 @@ ## Checks + features wishlist, and problems to think about: +* Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). * check for large jumps -* Which, if any, specific geo_ids are missing +* Which, if any, specific geo_ids are missing (get list from historical data) * different thresholds for different files? 
* use known erroneous/anomalous days of source data to re-adjust thresholds to not pass * check number of observations * tests * check for duplicate rows -* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced -* Data correctness and consistency over longer time periods (weeks to months) +* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly +* Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends * Currently, checks look at a data window of a few days * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 7276fdc93..3bf6f13ec 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -60,8 +60,7 @@ def get_geo_sig_cmbo(data_source): if data_source == 'fb-survey': # Currently metadata returns --*community*-- signals that don't get generated - # in the new fb-pipeline. Seiving them out for now. - # TODO: Include weighted whh_cmnty_cli and wnohh_cmnty_cli + # in the new fb-pipeline. Sieving them out for now. for sig in unique_signals: if "community" in sig: unique_signals.remove(sig) @@ -103,15 +102,15 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): continue # Load data from all found files. 
- for f in files: - df = load_csv(join(data_folder, f)) - for dt in date_slist: + for file in files: + data_df = load_csv(join(data_folder, file)) + for date in date_slist: # Add data's date, from CSV name, as new column - if f.find(dt) != -1: - gen_dt = datetime.strptime(dt, '%Y%m%d') - df['time_value'] = gen_dt - df_list.append(df) + if file.find(date) != -1: + source_date = datetime.strptime(date, '%Y%m%d') + data_df['time_value'] = source_date + df_list.append(data_df) yield pd.concat(df_list), geo_sig[0], geo_sig[1] diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 5423d811a..482b7add2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -14,7 +14,6 @@ from .datafetcher import filename_regex, \ read_filenames, load_csv, get_geo_sig_cmbo, \ read_geo_sig_cmbo_files, fetch_api_reference -import pdb # Recognized geo types. geo_regex_dict = { @@ -49,7 +48,7 @@ def make_date_filter(start_date, end_date): start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) - def f(match): + def temp_function(match): """ Return a boolean of whether a filename of appropriate format contains a date within the specified date range. @@ -71,7 +70,7 @@ def f(match): # Return boolean True if current file date "code" is within the defined date range. return start_code <= code <= end_code - return f + return temp_function class Validator(): @@ -594,23 +593,23 @@ def validate(self, export_dir): # Individual file checks # For every daily file, read in and do some basic format and value checks. 
for filename, match in validate_files: - df = load_csv(join(export_dir, filename)) + data_df = load_csv(join(export_dir, filename)) - self.check_df_format(df, filename) - self.check_bad_geo_id(df, filename, match.groupdict()['geo_type']) - self.check_bad_val(df, filename, match.groupdict()['signal']) - self.check_bad_se(df, filename) - self.check_bad_sample_size(df, filename) + self.check_df_format(data_df, filename) + self.check_bad_geo_id( + data_df, filename, match.groupdict()['geo_type']) + self.check_bad_val(data_df, filename, match.groupdict()['signal']) + self.check_bad_se(data_df, filename) + self.check_bad_sample_size(data_df, filename) # Get geo_type, date, and signal name as specified by CSV name. - df['geo_type'] = match.groupdict()['geo_type'] - df['date'] = match.groupdict()['date'] - df['signal'] = match.groupdict()['signal'] + data_df['geo_type'] = match.groupdict()['geo_type'] + data_df['date'] = match.groupdict()['date'] + data_df['signal'] = match.groupdict()['signal'] # Add current CSV data to all_frames. - all_frames.append(df) + all_frames.append(data_df) - # TODO: Multi-indexed dataframe for a given (signal, geo_type) all_frames = pd.concat(all_frames) # Get list of dates we expect to see in all the CSV data. @@ -629,12 +628,6 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') - # TODO: Remove for actual version - kroc = 0 - - # TODO: Improve efficiency by grouping all_frames by geo and sig instead - # of reading data in again via read_geo_sig_cmbo_files(). - # Comparison checks # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. 
for geo_sig_df, geo, sig in read_geo_sig_cmbo_files( @@ -643,8 +636,8 @@ def validate(self, export_dir): [name_match_pair[0] for name_match_pair in validate_files], date_slist): - m = smooth_option_regex.match(sig) - smooth_option = m.group(1) + match_obj = smooth_option_regex.match(sig) + smooth_option = match_obj.group(1) if smooth_option not in ('raw', 'smoothed'): smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' @@ -658,11 +651,6 @@ def validate(self, export_dir): self.check_min_allowed_max_date(max_date, weight_option, geo, sig) self.check_max_allowed_max_date(max_date, geo, sig) - # TODO: Check to see, if this date is in the API, if values have been updated - # and changed significantly. - - # TODO: Compare data against long-ago (3 months?) API data for changes in trends. - # Check data from a group of dates against recent (previous 7 days, by default) # data from the API. for checking_date in date_list: @@ -688,11 +676,6 @@ def validate(self, export_dir): self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) - # TODO: Remove for actual version - kroc += 1 - if kroc == 2: - break - self.exit() def exit(self): From e6a2242a5a1f018c31318dfebe160aafb06aa3a0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 9 Oct 2020 18:20:28 -0400 Subject: [PATCH 075/151] Updated params template --- validator/delphi_validator/errors.py | 1 - validator/delphi_validator/validate.py | 6 ++++++ validator/params.json.template | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/validator/delphi_validator/errors.py b/validator/delphi_validator/errors.py index b5ed9e69f..aa688ab54 100644 --- a/validator/delphi_validator/errors.py +++ b/validator/delphi_validator/errors.py @@ -35,4 +35,3 @@ def __init__(self, check_data_id, expression, message): check_data_id, tuple) and not isinstance(check_data_id, list) else tuple(check_data_id) 
self.expression = expression self.message = message - super().__init__(self.message) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 482b7add2..45a723d56 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -628,6 +628,8 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') + kroc = 0 + # Comparison checks # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. for geo_sig_df, geo, sig in read_geo_sig_cmbo_files( @@ -676,6 +678,10 @@ def validate(self, export_dir): self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) + kroc += 1 + if kroc == 2: + break + self.exit() def exit(self): diff --git a/validator/params.json.template b/validator/params.json.template index 128f4d10e..6d2a6d819 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,11 +1,12 @@ { "validation": { - "data_source": "usa-facts", - "start_date": "2020-09-05", - "end_date": "2020-09-08", - "ref_window_size": 7, - "minimum_sample_size": 100, - "missing_se_allowed": true, - "missing_sample_size_allowed": true + "data_source": "usa-facts", + "start_date": "2020-09-05", + "end_date": "2020-09-08", + "ref_window_size": 7, + "minimum_sample_size": 100, + "missing_se_allowed": true, + "missing_sample_size_allowed": true, + "suppressed_errors": [["check_min_max_date", "county", "confirmed_7dav_cumulative_prop"], ["check_val_lt_0", "20200906_county_deaths_7dav_incidence_num.csv"]] } } \ No newline at end of file From c5a99cc6617fc3299f3232e70419a50389bce033 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 10 Oct 2020 12:45:21 -0400 Subject: [PATCH 076/151] Flag kroc counter as temporary. 
Should be removed when production-ready --- validator/delphi_validator/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 45a723d56..3617b0d29 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -628,6 +628,7 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') + # TODO: Keeps script from checking all files in a test run. Remove when production-ready. kroc = 0 # Comparison checks From 919cedf4e747e3c5b9b142cf5c52fd8f79bb66d5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 10 Oct 2020 14:16:29 -0400 Subject: [PATCH 077/151] Added params.json flag to toggle test mode on and off. Test mode limits checks to only two data files in the working directory --- validator/delphi_validator/validate.py | 14 +++++++++----- validator/params.json.template | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 3617b0d29..822a8aac0 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -122,6 +122,7 @@ def __init__(self, params): 'sanity_check_rows_per_day', True) self.sanity_check_value_diffs = params.get( 'sanity_check_value_diffs', True) + self.test_mode = params.get("test_mode", False) self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( item, list) else tuple(item) for item in params.get('suppressed_errors', [])} @@ -628,8 +629,9 @@ def validate(self, export_dir): smooth_option_regex = re.compile(r'([^_]+)') - # TODO: Keeps script from checking all files in a test run. Remove when production-ready. - kroc = 0 + # Keeps script from checking all files in a test run. 
+ if self.test_mode: + kroc = 0 # Comparison checks # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. @@ -679,9 +681,11 @@ def validate(self, export_dir): self.check_avg_val_diffs( recent_df, reference_api_df, smooth_option, checking_date, geo, sig) - kroc += 1 - if kroc == 2: - break + # Keeps script from checking all files in a test run. + if self.test_mode: + kroc += 1 + if kroc == 2: + break self.exit() diff --git a/validator/params.json.template b/validator/params.json.template index 6d2a6d819..1695ef9e5 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -7,6 +7,7 @@ "minimum_sample_size": 100, "missing_se_allowed": true, "missing_sample_size_allowed": true, + "test_mode": true, "suppressed_errors": [["check_min_max_date", "county", "confirmed_7dav_cumulative_prop"], ["check_val_lt_0", "20200906_county_deaths_7dav_incidence_num.csv"]] } } \ No newline at end of file From 175f229e649c8d9d7bf23651cb5d280c4ddd4fe9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 10 Oct 2020 14:17:49 -0400 Subject: [PATCH 078/151] plans.md update --- validator/PLANS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index 686af661d..d22001054 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -24,6 +24,7 @@ * Errors are summarized in class attribute and printed on exit * Various check settings are controllable via indicator-specific params.json files * User can manually disable certain checks for certain sets of data using a field in the params.json file +* User can enable test mode (only a small number of data files are checked) using a field in the params.json file ## Checks + features wishlist, and problems to think about: From e8c760659c11735d1c92d9003cd49f173242add9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 10 Oct 2020 14:18:56 -0400 Subject: [PATCH 
079/151] params.json update --- validator/params.json.template | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validator/params.json.template b/validator/params.json.template index 1695ef9e5..657789b5b 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,10 +1,10 @@ { "validation": { - "data_source": "usa-facts", - "start_date": "2020-09-05", - "end_date": "2020-09-08", - "ref_window_size": 7, - "minimum_sample_size": 100, + "data_source": "usa-facts", + "start_date": "2020-09-05", + "end_date": "2020-09-08", + "ref_window_size": 7, + "minimum_sample_size": 100, "missing_se_allowed": true, "missing_sample_size_allowed": true, "test_mode": true, From 6357e66fc250c86cb148cd288ed78d0d509585bf Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 12 Oct 2020 15:16:07 -0400 Subject: [PATCH 080/151] Remove fb-survey "community" sieve. Co-authored-by: krivard --- validator/delphi_validator/datafetcher.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 3bf6f13ec..57f361228 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -58,12 +58,6 @@ def get_geo_sig_cmbo(data_source): unique_signals = source_meta['signal'].unique().tolist() unique_geotypes = source_meta['geo_type'].unique().tolist() - if data_source == 'fb-survey': - # Currently metadata returns --*community*-- signals that don't get generated - # in the new fb-pipeline. Sieving them out for now. 
- for sig in unique_signals: - if "community" in sig: - unique_signals.remove(sig) geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) print("Number of mixed types:", len(geo_sig_cmbo)) From 12e1318aee71bfceb8860abbce94576e7145756c Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 12 Oct 2020 15:21:48 -0400 Subject: [PATCH 081/151] Update regex format for national geo regions Co-authored-by: krivard --- validator/delphi_validator/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 822a8aac0..754f5ba5f 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -21,7 +21,7 @@ 'hrr': '^\d{1,3}$', 'msa': '^\d{5}$', 'state': '^[a-z]{2}$', - 'national': '^usa$' + 'national': '^[a-z]{2}$' } From 2478681413bb101c6ea3924edfde44c734c0778b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 13 Oct 2020 16:29:22 -0400 Subject: [PATCH 082/151] Added span_length setting and "latest" value for end_date to allow easier automated validation. Accompanying explanation added to README --- validator/README.md | 11 ++--- validator/delphi_validator/validate.py | 65 ++++++++++++++++++-------- validator/params.json.template | 2 +- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/validator/README.md b/validator/README.md index d12a0f36d..fa943c16a 100644 --- a/validator/README.md +++ b/validator/README.md @@ -6,10 +6,7 @@ The validator performs two main tasks: 2) Comparative analysis with recent data from the API to detect any anomalies, such as spikes or significant value differences -The validator validates new source data against daily data that is already written to disk, -making the execution of the validator independent of the pipeline execution. 
-This creates the additional advantage of validating against multiple -days of daily data for a better cummulative analysis. +The validator validates new source data in CSV format against data pulled from the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). ## Running the Validator @@ -29,10 +26,12 @@ python -m venv env source env/bin/activate pip install ../_delphi_utils_python/. pip install . -pip install -e ../validator +pip install ../validator ``` -All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `start_date`, and `end_date`. The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. +All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `span_length`, and `end_date`. + +The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. `end_date` specifies the last date to be checked; if set to "latest", `end_date` will always be the current date. `span_length` specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated, for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days. 
To execute the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run the validator, as follows: diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 754f5ba5f..02d832e0f 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -88,6 +88,7 @@ def __init__(self, params): - data_source: str; data source name, one of https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html - start_date: beginning date of data to check, in datetime date format + - span_length: number of days before the end date to include in checking - end_date: end date of data to check, in datetime date format - generation_date: date that this df_to_test was generated; typically 1 day after the last date in df_to_test @@ -105,10 +106,11 @@ def __init__(self, params): """ # Get user settings from params or if not provided, set default. self.data_source = params['data_source'] - self.start_date = datetime.date( - datetime.strptime(params['start_date'], '%Y-%m-%d')) - self.end_date = datetime.date( - datetime.strptime(params['end_date'], '%Y-%m-%d')) + + span_length = timedelta(days=params['span_length']) + self.end_date = date.today() if params['end_date'] == "latest" else datetime.strptime( + params['end_date'], '%Y-%m-%d').date() + self.start_date = self.end_date - span_length self.generation_date = date.today() self.max_check_lookbehind = timedelta( @@ -403,7 +405,7 @@ def check_min_allowed_max_date(self, max_date, weighted_option, geo, sig): self.raised_errors.append(ValidationError( ("check_min_max_date", geo, sig), max_date.date(), - "most recent date of generated file seems too long ago")) + "date of most recent generated file seems too long ago")) def check_max_allowed_max_date(self, max_date, geo, sig): """ @@ -417,11 +419,11 @@ def check_max_allowed_max_date(self, max_date, geo, sig): Returns: - None """ - if max_date > 
self.generation_date - timedelta(days=1): + if max_date > self.generation_date: self.raised_errors.append(ValidationError( ("check_max_max_date", geo, sig), max_date.date(), - "most recent date of generated file seems too recent")) + "date of most recent generated file seems too recent")) def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo, sig): """ @@ -449,7 +451,7 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date 'working files have already been compared against the reference, ' + 'that there is a bug somewhere')) - def check_rapid_change(self, df_to_test, df_to_reference, checking_date, geo, sig): + def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date, geo, sig): """ Compare number of obervations per day in test dataframe vs reference dataframe. @@ -513,16 +515,39 @@ def check_avg_val_diffs(self, # For each variable (val, se, and sample size) where not missing, calculate the # relative mean difference and mean absolute difference between the test data # and the reference data across all geographic regions. 
+ # + # Steps: + # - melt: creates a long version of df, where 'variable' specifies variable + # name (val, se, sample size) and 'value' specifies the value of said variable; + # geo_id and type columns are unchanged + # - pivot: each row is the test and reference values for a given geo + # region-variable type combo + # - reset_index: index is set to auto-incrementing int; geo_id and variable + # names are back as normal columns + # - dropna: drop all rows with at least one missing value (makes it + # impossible to compare reference and test) + # - assign: create new temporary columns, raw and abs value of difference + # between test and reference columns + # - groupby: group by variable name + # - agg: for every variable name group (across geo regions), calculate the + # mean of each of the raw difference between test and reference columns, the + # abs value of the difference between test and reference columns, all test + # values, all reference values + # - assign: use the new aggregate vars to calculate the relative mean + # difference, 2 * mean(differences) / difference(means) of two groups. df_all = pd.melt( df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] - ).pivot(index=("geo_id", "variable"), columns="type", values="value" - ).reset_index(("geo_id", "variable") - ).dropna( + ).pivot( + index=("geo_id", "variable"), columns="type", values="value" + ).reset_index( + ("geo_id", "variable") + ).dropna( ).assign( type_diff=lambda x: x["test"] - x["reference"], abs_type_diff=lambda x: abs(x["type_diff"]) - ).groupby("variable", as_index=False - ).agg( + ).groupby( + "variable", as_index=False + ).agg( mean_type_diff=("type_diff", "mean"), mean_abs_type_diff=("abs_type_diff", "mean"), mean_test_var=("test", "mean"), @@ -613,7 +638,7 @@ def validate(self, export_dir): all_frames = pd.concat(all_frames) - # Get list of dates we expect to see in all the CSV data. + # Get list of dates we expect to see in the source data. 
date_slist = all_frames['date'].unique().tolist() date_list = list( map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) @@ -649,9 +674,6 @@ def validate(self, export_dir): weight_option = 'weighted' if 'wili' in sig or 'wcli' in sig else 'unweighted' - print("Printing geo_sig_df scenes:", geo_sig_df.shape) - print(geo_sig_df) - max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date(max_date, weight_option, geo, sig) self.check_max_allowed_max_date(max_date, geo, sig) @@ -674,7 +696,7 @@ def validate(self, export_dir): recent_df, reference_api_df, checking_date, geo, sig) if self.sanity_check_rows_per_day: - self.check_rapid_change( + self.check_rapid_change_num_rows( recent_df, reference_api_df, checking_date, geo, sig) if self.sanity_check_value_diffs: @@ -698,8 +720,11 @@ def exit(self): subset_raised_errors = [] for val_error in self.raised_errors: - raised_check_id = tuple(item.strftime("%Y-%m-%d") if isinstance( - item, (date, datetime)) else item for item in val_error.check_data_id) + # Convert any dates in check_data_id to strings for the purpose of comparing + # to manually suppressed errors. 
+ raised_check_id = tuple([ + item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime)) + else item for item in val_error.check_data_id]) if raised_check_id not in self.suppressed_errors: subset_raised_errors.append(val_error) diff --git a/validator/params.json.template b/validator/params.json.template index 657789b5b..402d54b26 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -1,8 +1,8 @@ { "validation": { "data_source": "usa-facts", - "start_date": "2020-09-05", "end_date": "2020-09-08", + "span_length": "7", "ref_window_size": 7, "minimum_sample_size": 100, "missing_se_allowed": true, From caac16efb97b52ad82ab7bca34548eb74dadce22 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 13 Oct 2020 18:30:00 -0400 Subject: [PATCH 083/151] Update __main__.py --- validator/delphi_validator/__main__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/validator/delphi_validator/__main__.py b/validator/delphi_validator/__main__.py index bf03405fe..c7cca0ec9 100644 --- a/validator/delphi_validator/__main__.py +++ b/validator/delphi_validator/__main__.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- """Call the function run_module when executed. -This file indicates that calling the module (`python -m MODULE_NAME`) will -call the function `run_module` found within the run.py file. There should be -no need to change this template. +This file indicates that running the module (`python -m delphi_validator`) will +call the function `run_module` found within the run.py file. 
""" -from .run import run_module # pragma: no cover +from .run import run_module -run_module() # pragma: no cover +run_module() From 4a7951eb12c1a806a3394f15e4c06c3a8cf06365 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 13 Oct 2020 18:34:52 -0400 Subject: [PATCH 084/151] Update __init__.py --- validator/delphi_validator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/__init__.py b/validator/delphi_validator/__init__.py index 52a507259..04a4ece92 100644 --- a/validator/delphi_validator/__init__.py +++ b/validator/delphi_validator/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Module to pull and clean indicators from the XXXXX source. +"""Module to validate indicator source data before uploading to the public COVIDcast API. This file defines the functions that are made public by the module. As the module is intended to be executed though the main method, these are primarily From 05c6b0762ce6574b2778d294cb124364ecab70b0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 14 Oct 2020 09:39:04 -0400 Subject: [PATCH 085/151] update params template --- validator/params.json.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/params.json.template b/validator/params.json.template index 402d54b26..c3d029153 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -2,7 +2,7 @@ "validation": { "data_source": "usa-facts", "end_date": "2020-09-08", - "span_length": "7", + "span_length": 7, "ref_window_size": 7, "minimum_sample_size": 100, "missing_se_allowed": true, From 42d6c6b9513803fd01238c233e495ef40307cb6b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 14 Oct 2020 12:40:43 -0400 Subject: [PATCH 086/151] Added user settings to specify names of all smoothed signals for selecting the appropriate 
avg-value threshold; and to specify number of days lag on a per-signal basis --- validator/README.md | 4 +- validator/delphi_validator/datafetcher.py | 3 +- validator/delphi_validator/validate.py | 55 +++++++++++------------ validator/params.json.template | 35 +++++++++++++-- 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/validator/README.md b/validator/README.md index fa943c16a..5a28a7058 100644 --- a/validator/README.md +++ b/validator/README.md @@ -29,9 +29,9 @@ pip install . pip install ../validator ``` -All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `span_length`, and `end_date`. +All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `span_length`, `end_date`, and `smoothed_signals`. -The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. `end_date` specifies the last date to be checked; if set to "latest", `end_date` will always be the current date. `span_length` specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated, for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days. +The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. 
`end_date` specifies the last date to be checked; if set to "latest", `end_date` will always be the current date. `span_length` specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period), for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days. For `smoothed_signals`, please list the names of the signals that are smoothed (e.g. 7-day average). To execute the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run the validator, as follows: diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 57f361228..9f3e8dc25 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -58,9 +58,8 @@ def get_geo_sig_cmbo(data_source): unique_signals = source_meta['signal'].unique().tolist() unique_geotypes = source_meta['geo_type'].unique().tolist() - geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) - print("Number of mixed types:", len(geo_sig_cmbo)) + print("Number of expected geo region-signal combinations:", len(geo_sig_cmbo)) return geo_sig_cmbo diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 02d832e0f..e0a2b376e 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -101,6 +101,10 @@ def __init__(self, params): raise an exception or not - sanity_check_rows_per_day: boolean; check flag - sanity_check_value_diffs: boolean; check flag + - smoothed_signals: set of strings; names of signals that are smoothed (7-day + avg, etc) + - expected_lag: dict of signal names: int pairs; how many days behind do we + expect each signal to be - suppressed_errors: set of check_data_ids used to identify error 
messages to ignore - raised_errors: list to append errors to as they are raised """ @@ -126,6 +130,9 @@ def __init__(self, params): 'sanity_check_value_diffs', True) self.test_mode = params.get("test_mode", False) + self.smoothed_signals = set(params.get("smoothed_signals", [])) + self.expected_lag = params["expected_lag"] + self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( item, list) else tuple(item) for item in params.get('suppressed_errors', [])} @@ -380,26 +387,20 @@ def check_bad_sample_size(self, df_to_test, nameformat): result, "sample size must be NA or >= {self.minimum_sample_size}")) - def check_min_allowed_max_date(self, max_date, weighted_option, geo, sig): + def check_min_allowed_max_date(self, max_date, geo, sig): """ Check if time since data was generated is reasonable or too long ago. Arguments: - max_date: date of most recent data to be validated; datetime format. - - weighted_option: str; selects the "reasonable" threshold based on signal name - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - sig: str; signal name as in the CSV name Returns: - None """ - switcher = { - 'unweighted': timedelta(days=1), - 'weighted': timedelta(days=4) - } - # Get the setting from switcher dictionary - thres = switcher.get( - weighted_option, lambda: "Invalid weighting option") + thres = timedelta( + days=self.expected_lag[sig] if sig in self.expected_lag else 1) if max_date < self.generation_date - thres: self.raised_errors.append(ValidationError( @@ -471,9 +472,11 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date reference_rows_per_reporting_day = df_to_reference.shape[0] / len( set(df_to_reference["time_value"])) - if abs(reldiff_by_min( - test_rows_per_reporting_day, - reference_rows_per_reporting_day)) > 0.35: + compare_rows = reldiff_by_min( + test_rows_per_reporting_day, + reference_rows_per_reporting_day) + + if abs(compare_rows) > 0.35: 
self.raised_errors.append(ValidationError( ("check_rapid_change_num_rows", checking_date, geo, sig), (test_rows_per_reporting_day, reference_rows_per_reporting_day), @@ -482,7 +485,6 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date def check_avg_val_diffs(self, df_to_test, df_to_reference, - smooth_option, checking_date, geo, sig): """ @@ -492,8 +494,6 @@ def check_avg_val_diffs(self, - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - smooth_option: "raw" or "smoothed", choosen according to smoothing of signal - (e.g. 7dav is "smoothed") - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - sig: str; signal name as in the CSV name @@ -534,7 +534,7 @@ def check_avg_val_diffs(self, # abs value of the difference between test and reference columns, all test # values, all reference values # - assign: use the new aggregate vars to calculate the relative mean - # difference, 2 * mean(differences) / difference(means) of two groups. + # difference, 2 * mean(differences) / sum(means) of two groups. df_all = pd.melt( df_all, id_vars=["geo_id", "type"], value_vars=["val", "se", "sample_size"] ).pivot( @@ -571,6 +571,7 @@ def check_avg_val_diffs(self, } # Get the selected thresholds from switcher dictionary + smooth_option = "smoothed" if sig in self.smoothed_signals else "raw" thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. @@ -652,8 +653,6 @@ def validate(self, export_dir): # in time, how many days do we use to form the reference statistics. semirecent_lookbehind = timedelta(days=7) - smooth_option_regex = re.compile(r'([^_]+)') - # Keeps script from checking all files in a test run. 
if self.test_mode: kroc = 0 @@ -666,16 +665,8 @@ def validate(self, export_dir): [name_match_pair[0] for name_match_pair in validate_files], date_slist): - match_obj = smooth_option_regex.match(sig) - smooth_option = match_obj.group(1) - - if smooth_option not in ('raw', 'smoothed'): - smooth_option = 'smoothed' if '7dav' in sig or 'smoothed' in sig else 'raw' - - weight_option = 'weighted' if 'wili' in sig or 'wcli' in sig else 'unweighted' - max_date = geo_sig_df["time_value"].max() - self.check_min_allowed_max_date(max_date, weight_option, geo, sig) + self.check_min_allowed_max_date(max_date, geo, sig) self.check_max_allowed_max_date(max_date, geo, sig) # Check data from a group of dates against recent (previous 7 days, by default) @@ -685,6 +676,14 @@ def validate(self, export_dir): recent_df = geo_sig_df.query( 'time_value <= @checking_date & time_value >= @recent_cutoff_date') + if recent_df.empty: + self.raised_errors.append(ValidationError( + ("check_missing_geo_sig_date_combo", + checking_date, geo, sig), + None, + "Test data for a given checking date-geo-sig combination is missing")) + continue + # Reference dataframe runs backwards from the checking_date reference_start_date = checking_date - \ min(semirecent_lookbehind, self.max_check_lookbehind) @@ -701,7 +700,7 @@ def validate(self, export_dir): if self.sanity_check_value_diffs: self.check_avg_val_diffs( - recent_df, reference_api_df, smooth_option, checking_date, geo, sig) + recent_df, reference_api_df, checking_date, geo, sig) # Keeps script from checking all files in a test run. 
if self.test_mode: diff --git a/validator/params.json.template b/validator/params.json.template index c3d029153..066beb016 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -2,12 +2,41 @@ "validation": { "data_source": "usa-facts", "end_date": "2020-09-08", - "span_length": 7, + "span_length": 3, "ref_window_size": 7, "minimum_sample_size": 100, "missing_se_allowed": true, "missing_sample_size_allowed": true, + "smoothed_signals": [ + "confirmed_7dav_cumulative_num", + "confirmed_7dav_cumulative_prop", + "confirmed_7dav_incidence_num", + "confirmed_7dav_incidence_prop", + "deaths_7dav_cumulative_num", + "deaths_7dav_cumulative_prop", + "deaths_7dav_incidence_num", + "deaths_7dav_incidence_prop"], + "expected_lag": { + "confirmed_7dav_cumulative_num": 1, + "confirmed_7dav_cumulative_prop": 1, + "confirmed_7dav_incidence_num": 1, + "confirmed_7dav_incidence_prop": 1, + "deaths_7dav_cumulative_num": 1, + "deaths_7dav_cumulative_prop": 1, + "deaths_7dav_incidence_num": 1, + "deaths_7dav_incidence_prop": 1, + "confirmed_cumulative_num": 1, + "confirmed_cumulative_prop": 1, + "confirmed_incidence_num": 1, + "confirmed_incidence_prop": 1, + "deaths_cumulative_num": 1, + "deaths_cumulative_prop": 1, + "deaths_incidence_num": 1, + "deaths_incidence_prop": 1, + }, "test_mode": true, - "suppressed_errors": [["check_min_max_date", "county", "confirmed_7dav_cumulative_prop"], ["check_val_lt_0", "20200906_county_deaths_7dav_incidence_num.csv"]] + "suppressed_errors": [ + ["check_min_max_date", "county", "confirmed_7dav_cumulative_prop"], + ["check_val_lt_0", "20200906_county_deaths_7dav_incidence_num.csv"]] } -} \ No newline at end of file +} From 37ad56bcd3ecf8f3247df3241cfb5f34699fd970 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 14 Oct 2020 18:52:52 -0400 Subject: [PATCH 087/151] Update plans doc. 
Add info about user settings to readme --- validator/PLANS.md | 28 +++++++++++++++----------- validator/README.md | 20 ++++++++++++++---- validator/delphi_validator/validate.py | 4 ++-- validator/params.json.template | 3 +-- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index d22001054..6db424b17 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -3,7 +3,7 @@ ## Current checks for indicator source data * Missing dates within the selected range -* Appropriate file name +* Recognized file name format * Recognized geographical type (county, state, etc) * Recognized geo id format (e.g. state is two lowercase letters) * Missing geo type + signal + date combos based on the geo type + signal combos Covidcast metadata says should be available @@ -16,34 +16,38 @@ * If signal and stderr both = 0 (seen in Quidel data due to lack of Jeffreys correction, [issue 255](https://github.com/cmu-delphi/covidcast-indicators/issues/255#issuecomment-692196541)) * Missing ‘sample_size’ values * Appropriate ‘sample_size’ values, ≥ 100 (default) or user-defined threshold +* Most recent date seen in source data is recent enough, < 1 day ago (default) or user-defined on a per-signal basis +* Most recent date seen in source data is not in the future +* Most recent date seen in source data is not older than most recent date seen in reference data * Similar number of obs per day as recent API data (static threshold) -* Similar average values as API data (static threshold) +* Similar average value as API data (static threshold) + ## Current features * Errors are summarized in class attribute and printed on exit * Various check settings are controllable via indicator-specific params.json files * User can manually disable certain checks for certain sets of data using a field in the params.json file -* User can enable test mode (only a small number of data files are checked) using a field in the params.json file +* User can 
enable test mode (checks only a small number of data files) using a field in the params.json file ## Checks + features wishlist, and problems to think about: * Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). -* check for large jumps +* Check for large jumps * Which, if any, specific geo_ids are missing (get list from historical data) -* different thresholds for different files? -* use known erroneous/anomalous days of source data to re-adjust thresholds to not pass -* check number of observations -* tests -* check for duplicate rows +* Different test thresholds for different files? Currently some control based on smoothed vs raw signals +* Use known erroneous/anomalous days of source data to tune static thresholds +* Check for duplicate rows * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. - * Long-term trends - * Currently, checks look at a data window of a few days + * Long-term trends. Currently, checks only look at a data window of a few days * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas * E.g. Doctor visits decreasing correlation with cases * E.g. WY/RI missing or very low compared to historical * Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. 
-* Order raised exceptions by p-value, correcting for multiple testing +* Order raised exceptions by p-value +* Correct p-values for multiple testing +* Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differeng geo regions, e.g.) are "close" to significant * Nicer formatting for error “report” * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first +* Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status) diff --git a/validator/README.md b/validator/README.md index 5a28a7058..0c10ec836 100644 --- a/validator/README.md +++ b/validator/README.md @@ -29,10 +29,6 @@ pip install . pip install ../validator ``` -All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. Working defaults are provided for all but `data_source`, `span_length`, `end_date`, and `smoothed_signals`. - -The `data_source` should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls. `end_date` specifies the last date to be checked; if set to "latest", `end_date` will always be the current date. `span_length` specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period), for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days. For `smoothed_signals`, please list the names of the signals that are smoothed (e.g. 7-day average). 
- To execute the module and validate source data (by default, in `receiving`), run the indicator to generate data files, then run the validator, as follows: @@ -49,6 +45,22 @@ deactivate rm -r env ``` +### Customization + +All of the user-changable parameters are stored in the `validation` field of the indicator's `params.json` file. If `params.json` does not already include a `validation` field, please copy that provided in this module's `params.json.template`. + +Please update the follow settings: + +* `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls +* `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date +* `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period), for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days +* `smoothed_signals`: list of the names of the signals that are smoothed (e.g. 7-day average) +* `expected_lag`: dictionary of signal name-int pairs specifying the number of days of expected lag (time between event occurrence and when data about that event was published) for that signal +* `test_mode`: boolean; `true` checks only a small number of data files +* `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable + +All other fields contain working defaults. 
+ ## Testing the code To test the code, please create a new virtual environment in the main module directory using the following procedure, similar to above: diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index e0a2b376e..768e04ab8 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -48,7 +48,7 @@ def make_date_filter(start_date, end_date): start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) - def temp_function(match): + def date_filter(match): """ Return a boolean of whether a filename of appropriate format contains a date within the specified date range. @@ -70,7 +70,7 @@ def temp_function(match): # Return boolean True if current file date "code" is within the defined date range. return start_code <= code <= end_code - return temp_function + return date_filter class Validator(): diff --git a/validator/params.json.template b/validator/params.json.template index 066beb016..2b71c0f89 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -32,8 +32,7 @@ "deaths_cumulative_num": 1, "deaths_cumulative_prop": 1, "deaths_incidence_num": 1, - "deaths_incidence_prop": 1, - }, + "deaths_incidence_prop": 1}, "test_mode": true, "suppressed_errors": [ ["check_min_max_date", "county", "confirmed_7dav_cumulative_prop"], From 8027a19edd438cef69829391c3b7c28f9c137824 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 15 Oct 2020 10:22:00 -0400 Subject: [PATCH 088/151] updated tests --- validator/README.md | 10 ++--- validator/delphi_validator/validate.py | 11 ++--- validator/tests/test_checks.py | 62 +++++++++++++------------- 3 files changed, 40 insertions(+), 43 deletions(-) diff --git a/validator/README.md b/validator/README.md index 0c10ec836..8b72a7ab9 100644 --- a/validator/README.md +++ b/validator/README.md @@ -57,9 +57,9 @@ Please update the follow settings: * 
`smoothed_signals`: list of the names of the signals that are smoothed (e.g. 7-day average) * `expected_lag`: dictionary of signal name-int pairs specifying the number of days of expected lag (time between event occurrence and when data about that event was published) for that signal * `test_mode`: boolean; `true` checks only a small number of data files -* `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable +* `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected -All other fields contain working defaults. +All other fields contain working defaults, to be modified as needed. ## Testing the code @@ -101,12 +101,12 @@ The output will show the number of unit tests that passed and failed, along with ## Adding checks -To add a new validation check, define the check as a `Validator` class method in `validate.py`. Each check should append a descriptive error message to the `raised` attribute if triggered. All checks should allow the user to override exception raising for a specific file using the `exception_override` setting in `params.json`. +To add a new validation check, define the check as a `Validator` class method in `validate.py`. Each check should append a descriptive error message to the `raised` attribute if triggered. All checks should allow the user to override exception raising for a specific file using the `suppressed_errors` setting in `params.json`. This features requires that the `check_data_id` defined for an error uniquely identifies that combination of check and test data. This usually takes the form of a tuple of strings with the check method and test identifier, and test data filename or date, geo type, and signal name. Add the newly defined check to the `validate()` method to be executed. 
It should go in one of three sections: * data sanity checks where a data file is compared against static format settings, -* data trend and value checks where a set of data is compared against recent API data, from the previous few days, -* data trend and value checks where a set of data is compared against long term API data, from a few months ago \ No newline at end of file +* data trend and value checks where a set of source data (can be one or several days) is compared against recent API data, from the previous few days, +* data trend and value checks where a set of source data is compared against long term API data, from the last few months \ No newline at end of file diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 768e04ab8..b94f13d6a 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -483,10 +483,7 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date "Number of rows per day (-with-any-rows) seems to have changed " + "rapidly (reference vs test data)")) - def check_avg_val_diffs(self, - df_to_test, df_to_reference, - checking_date, - geo, sig): + def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, sig): """ Compare average values for each variable in test dataframe vs reference dataframe. @@ -609,9 +606,6 @@ def validate(self, export_dir): # Make list of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] - # Get all expected combinations of geo_type and signal. - geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) - self.check_missing_dates(validate_files) self.check_settings() @@ -653,6 +647,9 @@ def validate(self, export_dir): # in time, how many days do we use to form the reference statistics. semirecent_lookbehind = timedelta(days=7) + # Get all expected combinations of geo_type and signal. 
+ geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) + # Keeps script from checking all files in a test run. if self.test_mode: kroc = 0 diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 759008333..5a0b4c5e8 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -59,8 +59,8 @@ def test_empty(self): class TestValidatorInitialization: def test_default_settings(self): - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-01"} + params = {"data_source": "", "span_length": 0, + "end_date": "2020-09-01", "expected_lag": {}} validator = Validator(params) assert validator.max_check_lookbehind == timedelta(days=7) @@ -77,8 +77,8 @@ def test_default_settings(self): class TestCheckMissingDates: def test_empty_filelist(self): - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-09"} + params = {"data_source": "", "span_length": 8, + "end_date": "2020-09-09", "expected_lag": {}} validator = Validator(params) filenames = list() @@ -90,8 +90,8 @@ def test_empty_filelist(self): assert len(validator.raised_errors[0].expression) == 9 def test_same_day(self): - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-01"} + params = {"data_source": "", "span_length": 0, + "end_date": "2020-09-01", "expected_lag": {}} validator = Validator(params) filenames = [("20200901_county_signal_signal.csv", "match_obj")] @@ -102,8 +102,8 @@ def test_same_day(self): err.check_data_id[0] for err in validator.raised_errors] def test_duplicate_dates(self): - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} validator = Validator(params) filenames = [("20200901_county_signal_signal.csv", "match_obj"), @@ -147,8 +147,8 @@ def test_expected_groups(self): class TestCheckBadGeoId: - params = {"data_source": "", "start_date": 
"2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 0, + "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) @@ -218,19 +218,19 @@ def test_invalid_geo_id_state(self): def test_invalid_geo_id_national(self): validator = Validator(self.params) - df = pd.DataFrame(["usa", "USA", " usa", "us", - "usausa", "America"], columns=["geo_id"]) + df = pd.DataFrame(["usa", "SP", " us", "us", + "usausa", "US"], columns=["geo_id"]) validator.check_bad_geo_id(df, "name", "national") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id assert len(validator.raised_errors[0].expression) == 5 - assert "usa" not in validator.raised_errors[0].expression + assert "us" not in validator.raised_errors[0].expression class TestCheckBadVal: - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) @@ -275,8 +275,8 @@ def test_gt_max_prop(self): class TestCheckBadSe: - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) @@ -350,8 +350,8 @@ def test_jeffreys(self): class TestCheckBadN: - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) @@ -406,14 +406,14 @@ def test_lt_min_missing_not_allowed(self): class TestCheckRapidChange: - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", 
"expected_lag": {}} def test_same_df(self): validator = Validator(self.params) test_df = pd.DataFrame([date.today()] * 5, columns=["time_value"]) ref_df = pd.DataFrame([date.today()] * 5, columns=["time_value"]) - validator.check_rapid_change( + validator.check_rapid_change_num_rows( test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -425,7 +425,7 @@ def test_0_vs_many(self): test_df = pd.DataFrame([time_value] * 5, columns=["time_value"]) ref_df = pd.DataFrame([time_value] * 1, columns=["time_value"]) - validator.check_rapid_change( + validator.check_rapid_change_num_rows( test_df, ref_df, time_value, "geo", "signal") assert len(validator.raised_errors) == 1 @@ -434,8 +434,8 @@ def test_0_vs_many(self): class TestCheckAvgValDiffs: - params = {"data_source": "", "start_date": "2020-09-01", - "end_date": "2020-09-02"} + params = {"data_source": "", "span_length": 1, + "end_date": "2020-09-02", "expected_lag": {}} def test_same_val(self): validator = Validator(self.params) @@ -447,7 +447,7 @@ def test_same_val(self): ref_df = pd.DataFrame(data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", date.today(), "geo", "signal") + test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -461,7 +461,7 @@ def test_same_se(self): ref_df = pd.DataFrame(data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", date.today(), "geo", "signal") + test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -475,7 +475,7 @@ def test_same_n(self): ref_df = pd.DataFrame(data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", date.today(), "geo", "signal") + test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -489,7 +489,7 @@ def test_10x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", + test_df, ref_df, datetime.combine(date.today(), 
datetime.min.time()), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -504,7 +504,7 @@ def test_100x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", + test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") assert len(validator.raised_errors) == 1 @@ -521,7 +521,7 @@ def test_1000x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) validator.check_avg_val_diffs( - test_df, ref_df, "raw", + test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") assert len(validator.raised_errors) == 1 From 0f0938b8e007351552b55381bfde8a1d64d68872 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 15 Oct 2020 10:32:50 -0400 Subject: [PATCH 089/151] plan updates --- validator/PLANS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 6db424b17..65a45107a 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -33,7 +33,7 @@ ## Checks + features wishlist, and problems to think about: * Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). -* Check for large jumps +* Check explicitly for large spikes (avg_val check can detect large jumps and especially large spikes) * Which, if any, specific geo_ids are missing (get list from historical data) * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Use known erroneous/anomalous days of source data to tune static thresholds @@ -44,10 +44,10 @@ * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas * E.g. Doctor visits decreasing correlation with cases * E.g. 
WY/RI missing or very low compared to historical
-* Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error.
-* Order raised exceptions by p-value
-* Correct p-values for multiple testing
-* Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differeng geo regions, e.g.) are "close" to significant
+* Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. See [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1601307675021000?thread_ts=1600277030.103500&cid=CV1SYBC90) and [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1600978037007500?thread_ts=1600277030.103500&cid=CV1SYBC90) for more background.
+  * Order raised exceptions by p-value
+  * Correct p-values for multiple testing
+  * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differing geo regions, e.g.) are "close" to significant
 * Nicer formatting for error “report”
 * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first
 * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status)

From 15463c325d752c0f3e9d77de5b3c63f469ab1384 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 20 Oct 2020 15:16:22 -0400
Subject: [PATCH 090/151] Lengthen abbreviated obj names.
Print total number of tests run --- validator/delphi_validator/datafetcher.py | 49 ++++--- validator/delphi_validator/validate.py | 158 +++++++++++++++------- validator/tests/test_checks.py | 12 +- 3 files changed, 145 insertions(+), 74 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 9f3e8dc25..b32fa21f0 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -48,29 +48,30 @@ def load_csv(path): }) -def get_geo_sig_cmbo(data_source): +def get_geo_signal_combos(data_source): """ Get list of geo type-signal type combinations that we expect to see, based on - combinations reported available by Covidcast metadata. + combinations reported available by COVIDcast metadata. """ meta = covidcast.metadata() source_meta = meta[meta['data_source'] == data_source] unique_signals = source_meta['signal'].unique().tolist() unique_geotypes = source_meta['geo_type'].unique().tolist() - geo_sig_cmbo = list(product(unique_geotypes, unique_signals)) - print("Number of expected geo region-signal combinations:", len(geo_sig_cmbo)) + geo_signal_combos = list(product(unique_geotypes, unique_signals)) + print("Number of expected geo region-signal combinations:", + len(geo_signal_combos)) - return geo_sig_cmbo + return geo_signal_combos -def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): +def read_geo_signal_combo_files(geo_signal_combos, data_folder, filenames, date_slist): """ - Generator that assembles data within the specified date range for a given geo_sig_cmbo. + Generator that assembles data within the specified date range for a given geo_signal_combo. 
Arguments: - - geo_sig_cmbo: list of geo type-signal type combinations that we expect to see, - based on combinations reported available by Covidcast metadata + - geo_signal_combos: list of geo type-signal type combinations that we expect to see, + based on combinations reported available by COVIDcast metadata - data_folder: path to the directory containing CSV data files. - filenames: list of filenames - date_slist: list of dates (formatted as strings) to check @@ -81,17 +82,18 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): - relevant geo type (str) - relevant signal type (str) """ - for geo_sig in geo_sig_cmbo: + for geo_signal_combo in geo_signal_combos: df_list = list() # Get all filenames for this geo_type and signal_type - files = [file for file in filenames if geo_sig[0] - in file and geo_sig[1] in file] + files = [file for file in filenames if geo_signal_combo[0] + in file and geo_signal_combo[1] in file] if len(files) == 0: print("FILE_NOT_FOUND: File with geo_type:", - geo_sig[0], " and signal:", geo_sig[1], " does not exist!") - yield pd.DataFrame(), geo_sig[0], geo_sig[1] + geo_signal_combo[0], " and signal:", geo_signal_combo[1], + " does not exist!") + yield pd.DataFrame(), geo_signal_combo[0], geo_signal_combo[1] continue # Load data from all found files. @@ -105,23 +107,23 @@ def read_geo_sig_cmbo_files(geo_sig_cmbo, data_folder, filenames, date_slist): data_df['time_value'] = source_date df_list.append(data_df) - yield pd.concat(df_list), geo_sig[0], geo_sig[1] + yield pd.concat(df_list), geo_signal_combo[0], geo_signal_combo[1] -def fetch_api_reference(data_source, start_date, end_date, geo, sig): +def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type): """ Get and process API data for use as a reference. Formatting is changed to match that of source data CSVs. 
""" api_df = covidcast.signal( - data_source, sig, start_date, end_date, geo) + data_source, signal_type, start_date, end_date, geo_type) if not isinstance(api_df, pd.DataFrame): custom_msg = "Error fetching data from " + str(start_date) + \ " to " + str(end_date) + \ "for data source: " + data_source + \ - ", signal-type: " + sig + \ - ", geography-type: " + geo + ", signal-type: " + signal_type + \ + ", geography-type: " + geo_type raise APIDataFetchError(custom_msg) @@ -131,8 +133,11 @@ def fetch_api_reference(data_source, start_date, end_date, geo, sig): # Replace None with NA to make numerical manipulation easier. # Rename and reorder columns to match those in df_to_test. api_df = api_df.replace( - to_replace=[None], value=np.nan).rename( - columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'}).drop( - ['direction', 'issue', 'lag'], axis=1).reindex(columns=column_names) + to_replace=[None], value=np.nan + ).rename( + columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'} + ).drop( + ['direction', 'issue', 'lag'], axis=1 + ).reindex(columns=column_names) return api_df diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index b94f13d6a..c54985830 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -12,8 +12,8 @@ from .errors import ValidationError from .datafetcher import filename_regex, \ - read_filenames, load_csv, get_geo_sig_cmbo, \ - read_geo_sig_cmbo_files, fetch_api_reference + read_filenames, load_csv, get_geo_signal_combos, \ + read_geo_signal_combo_files, fetch_api_reference # Recognized geo types. geo_regex_dict = { @@ -25,7 +25,7 @@ } -def reldiff_by_min(x, y): +def relative_difference_by_min(x, y): """ Calculate relative difference between two numbers. 
""" @@ -48,7 +48,7 @@ def make_date_filter(start_date, end_date): start_code = int(start_date.strftime("%Y%m%d")) end_code = int(end_date.strftime("%Y%m%d")) - def date_filter(match): + def custom_date_filter(match): """ Return a boolean of whether a filename of appropriate format contains a date within the specified date range. @@ -70,7 +70,7 @@ def date_filter(match): # Return boolean True if current file date "code" is within the defined date range. return start_code <= code <= end_code - return date_filter + return custom_date_filter class Validator(): @@ -111,12 +111,14 @@ def __init__(self, params): # Get user settings from params or if not provided, set default. self.data_source = params['data_source'] + # Date/time settings span_length = timedelta(days=params['span_length']) self.end_date = date.today() if params['end_date'] == "latest" else datetime.strptime( params['end_date'], '%Y-%m-%d').date() self.start_date = self.end_date - span_length self.generation_date = date.today() + # General options: flags, thresholds self.max_check_lookbehind = timedelta( days=params.get("ref_window_size", 7)) self.minimum_sample_size = params.get('minimum_sample_size', 100) @@ -130,13 +132,20 @@ def __init__(self, params): 'sanity_check_value_diffs', True) self.test_mode = params.get("test_mode", False) + # Signal-specific settings self.smoothed_signals = set(params.get("smoothed_signals", [])) self.expected_lag = params["expected_lag"] self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance( item, list) else tuple(item) for item in params.get('suppressed_errors', [])} + # Output self.raised_errors = [] + self.total_checks = 0 + + def increment_total_checks(self): + """ Add 1 to total_checks counter """ + self.total_checks += 1 def check_missing_dates(self, daily_filenames): """ @@ -169,6 +178,8 @@ def check_missing_dates(self, daily_filenames): "Missing dates are observed; if these dates are" + " already in the API they would not be updated")) 
+ self.increment_total_checks() + def check_settings(self): """ Perform some automated format & sanity checks of parameters. @@ -185,16 +196,22 @@ def check_settings(self): self.max_check_lookbehind, "max_check_lookbehind must be of type datetime.timedelta")) + self.increment_total_checks() + if not isinstance(self.generation_date, date): self.raised_errors.append(ValidationError( ("check_type_generation_date"), self.generation_date, "generation_date must be a datetime.date type")) + self.increment_total_checks() + if self.generation_date > date.today(): self.raised_errors.append(ValidationError( ("check_future_generation_date"), self.generation_date, "generation_date must not be in the future")) + self.increment_total_checks() + def check_df_format(self, df_to_test, nameformat): """ Check basic format of source data CSV df. @@ -213,11 +230,15 @@ def check_df_format(self, df_to_test, nameformat): ("check_filename_format", nameformat), nameformat, 'nameformat not recognized')) + self.increment_total_checks() + if not isinstance(df_to_test, pd.DataFrame): self.raised_errors.append(ValidationError( ("check_file_data_format", nameformat), type(df_to_test), 'df_to_test must be a pandas dataframe.')) + self.increment_total_checks() + def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ Check validity of geo type and values, according to regex pattern. @@ -253,6 +274,8 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex): find_all_unexpected_geo_ids( df_to_test, geo_regex_dict[geo_type]) + self.increment_total_checks() + def check_bad_val(self, df_to_test, nameformat, signal_type): """ Check value field for validity. 
@@ -275,6 +298,8 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): df_to_test[(df_to_test['val'] > 100)], "val column can't have any cell greater than 100 for percents")) + self.increment_total_checks() + if proportion_option: if not df_to_test[(df_to_test['val'] > 100000)].empty: self.raised_errors.append(ValidationError( @@ -282,17 +307,23 @@ def check_bad_val(self, df_to_test, nameformat, signal_type): df_to_test[(df_to_test['val'] > 100000)], "val column can't have any cell greater than 100000 for proportions")) + self.increment_total_checks() + if df_to_test['val'].isnull().values.any(): self.raised_errors.append(ValidationError( ("check_val_missing", nameformat), None, "val column can't have any cell that is NA")) + self.increment_total_checks() + if not df_to_test[(df_to_test['val'] < 0)].empty: self.raised_errors.append(ValidationError( ("check_val_lt_0", nameformat), df_to_test[(df_to_test['val'] < 0)], "val column can't have any cell smaller than 0")) + self.increment_total_checks() + def check_bad_se(self, df_to_test, nameformat): """ Check standard errors for validity. 
@@ -322,11 +353,15 @@ def check_bad_se(self, df_to_test, nameformat): ("check_se_not_missing_and_in_range", nameformat), result, "se must be in (0, min(50,val*(1+eps))] and not missing")) + self.increment_total_checks() + if df_to_test["se"].isnull().mean() > 0.5: self.raised_errors.append(ValidationError( ("check_se_many_missing", nameformat), None, 'Recent se values are >50% NA')) + self.increment_total_checks() + elif self.missing_se_allowed: result = df_to_test.query( '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))') @@ -336,6 +371,8 @@ def check_bad_se(self, df_to_test, nameformat): ("check_se_missing_or_in_range", nameformat), result, "se must be NA or in (0, min(50,val*(1+eps))]")) + self.increment_total_checks() + result_jeffreys = df_to_test.query('(val == 0) & (se == 0)') result_alt = df_to_test.query('se == 0') @@ -344,12 +381,16 @@ def check_bad_se(self, df_to_test, nameformat): ("check_se_0_when_val_0", nameformat), None, "when signal value is 0, se must be non-zero. please " - + "use Jeffreys correction to generate an appropriate se")) + + "use Jeffreys correction to generate an appropriate se" + + " (see wikipedia.org/wiki/Binomial_proportion_confidence" + + "_interval#Jeffreys_interval for details)")) elif not result_alt.empty: self.raised_errors.append(ValidationError( ("check_se_0", nameformat), result_alt, "se must be non-zero")) + self.increment_total_checks() + def check_bad_sample_size(self, df_to_test, nameformat): """ Check sample sizes for validity. 
@@ -368,6 +409,8 @@ def check_bad_sample_size(self, df_to_test, nameformat): ("check_n_missing", nameformat), None, "sample_size must not be NA")) + self.increment_total_checks() + # Find rows with sample size less than minimum allowed result = df_to_test.query( '(sample_size < @self.minimum_sample_size)') @@ -377,6 +420,8 @@ def check_bad_sample_size(self, df_to_test, nameformat): ("check_n_gt_min", nameformat), result, "sample size must be >= {self.minimum_sample_size}")) + self.increment_total_checks() + elif self.missing_sample_size_allowed: result = df_to_test.query( '~(sample_size.isnull() | (sample_size >= @self.minimum_sample_size))') @@ -387,46 +432,54 @@ def check_bad_sample_size(self, df_to_test, nameformat): result, "sample size must be NA or >= {self.minimum_sample_size}")) - def check_min_allowed_max_date(self, max_date, geo, sig): + self.increment_total_checks() + + def check_min_allowed_max_date(self, max_date, geo_type, signal_type): """ Check if time since data was generated is reasonable or too long ago. Arguments: - max_date: date of most recent data to be validated; datetime format. 
- - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - - sig: str; signal name as in the CSV name + - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name + - signal_type: str; signal name as in the CSV name Returns: - None """ thres = timedelta( - days=self.expected_lag[sig] if sig in self.expected_lag else 1) + days=self.expected_lag[signal_type] if signal_type in self.expected_lag + else 1) if max_date < self.generation_date - thres: self.raised_errors.append(ValidationError( - ("check_min_max_date", geo, sig), + ("check_min_max_date", geo_type, signal_type), max_date.date(), "date of most recent generated file seems too long ago")) - def check_max_allowed_max_date(self, max_date, geo, sig): + self.increment_total_checks() + + def check_max_allowed_max_date(self, max_date, geo_type, signal_type): """ Check if time since data was generated is reasonable or too recent. Arguments: - max_date: date of most recent data to be validated; datetime format. - - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - - sig: str; signal name as in the CSV name + - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name + - signal_type: str; signal name as in the CSV name Returns: - None """ if max_date > self.generation_date: self.raised_errors.append(ValidationError( - ("check_max_max_date", geo, sig), + ("check_max_max_date", geo_type, signal_type), max_date.date(), "date of most recent generated file seems too recent")) - def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, geo, sig): + self.increment_total_checks() + + def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date, + geo_type, signal_type): """ Check if reference data is more recent than test data. 
@@ -435,15 +488,16 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date (one day-signal-geo_type combo) - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - - sig: str; signal name as in the CSV name + - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name + - signal_type: str; signal name as in the CSV name Returns: - None """ if df_to_test["time_value"].max() < df_to_reference["time_value"].max(): self.raised_errors.append(ValidationError( - ("check_max_date_vs_reference", checking_date.date(), geo, sig), + ("check_max_date_vs_reference", + checking_date.date(), geo_type, signal_type), (df_to_test["time_value"].max(), df_to_reference["time_value"].max()), 'reference df has days beyond the max date in the =df_to_test=; ' + @@ -452,7 +506,10 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date 'working files have already been compared against the reference, ' + 'that there is a bug somewhere')) - def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date, geo, sig): + self.increment_total_checks() + + def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date, + geo_type, signal_type): """ Compare number of obervations per day in test dataframe vs reference dataframe. 
@@ -461,8 +518,8 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - checking_date: datetime date - - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - - sig: str; signal name as in the CSV name + - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name + - signal_type: str; signal name as in the CSV name Returns: - None @@ -472,18 +529,22 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date reference_rows_per_reporting_day = df_to_reference.shape[0] / len( set(df_to_reference["time_value"])) - compare_rows = reldiff_by_min( + compare_rows = relative_difference_by_min( test_rows_per_reporting_day, reference_rows_per_reporting_day) if abs(compare_rows) > 0.35: self.raised_errors.append(ValidationError( - ("check_rapid_change_num_rows", checking_date, geo, sig), + ("check_rapid_change_num_rows", + checking_date, geo_type, signal_type), (test_rows_per_reporting_day, reference_rows_per_reporting_day), "Number of rows per day (-with-any-rows) seems to have changed " + "rapidly (reference vs test data)")) - def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, sig): + self.increment_total_checks() + + def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, geo_type, + signal_type): """ Compare average values for each variable in test dataframe vs reference dataframe. 
@@ -491,8 +552,8 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, s - df_to_test: pandas dataframe of CSV source data - df_to_reference: pandas dataframe of reference data, either from the COVIDcast API or semirecent data - - geo: str; geo type name (county, msa, hrr, state) as in the CSV name - - sig: str; signal name as in the CSV name + - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name + - signal_type: str; signal name as in the CSV name Returns: - None @@ -568,7 +629,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, s } # Get the selected thresholds from switcher dictionary - smooth_option = "smoothed" if sig in self.smoothed_signals else "raw" + smooth_option = "smoothed" if signal_type in self.smoothed_signals else "raw" thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. @@ -581,7 +642,7 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, s if mean_stddiff_high or mean_stdabsdiff_high: self.raised_errors.append(ValidationError( ("check_test_vs_reference_avg_changed", - checking_date.date(), geo, sig), + checking_date.date(), geo_type, signal_type), (mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data ' + '(either semirecent or from API) seem large --- either large increase ' @@ -589,6 +650,8 @@ def check_avg_val_diffs(self, df_to_test, df_to_reference, checking_date, geo, s + 'to average values of corresponding variables. For the former check, ' + 'tolerances for `val` are more restrictive than those for other columns.')) + self.increment_total_checks() + def validate(self, export_dir): """ Runs all data checks. @@ -648,7 +711,7 @@ def validate(self, export_dir): semirecent_lookbehind = timedelta(days=7) # Get all expected combinations of geo_type and signal. 
- geo_sig_cmbo = get_geo_sig_cmbo(self.data_source) + geo_signal_combos = get_geo_signal_combos(self.data_source) # Keeps script from checking all files in a test run. if self.test_mode: @@ -656,15 +719,15 @@ def validate(self, export_dir): # Comparison checks # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. - for geo_sig_df, geo, sig in read_geo_sig_cmbo_files( - geo_sig_cmbo, + for geo_sig_df, geo_type, signal_type in read_geo_signal_combo_files( + geo_signal_combos, export_dir, [name_match_pair[0] for name_match_pair in validate_files], date_slist): max_date = geo_sig_df["time_value"].max() - self.check_min_allowed_max_date(max_date, geo, sig) - self.check_max_allowed_max_date(max_date, geo, sig) + self.check_min_allowed_max_date(max_date, geo_type, signal_type) + self.check_max_allowed_max_date(max_date, geo_type, signal_type) # Check data from a group of dates against recent (previous 7 days, by default) # data from the API. @@ -673,12 +736,15 @@ def validate(self, export_dir): recent_df = geo_sig_df.query( 'time_value <= @checking_date & time_value >= @recent_cutoff_date') + self.increment_total_checks() + if recent_df.empty: self.raised_errors.append(ValidationError( ("check_missing_geo_sig_date_combo", - checking_date, geo, sig), + checking_date, geo_type, signal_type), None, - "Test data for a given checking date-geo-sig combination is missing")) + "Test data for a given checking date-geo type-signal type" + + " combination is missing")) continue # Reference dataframe runs backwards from the checking_date @@ -686,18 +752,19 @@ def validate(self, export_dir): min(semirecent_lookbehind, self.max_check_lookbehind) reference_end_date = recent_cutoff_date - timedelta(days=1) reference_api_df = fetch_api_reference( - self.data_source, reference_start_date, reference_end_date, geo, sig) + self.data_source, reference_start_date, reference_end_date, + geo_type, signal_type) self.check_max_date_vs_reference( - recent_df, 
reference_api_df, checking_date, geo, sig) + recent_df, reference_api_df, checking_date, geo_type, signal_type) if self.sanity_check_rows_per_day: self.check_rapid_change_num_rows( - recent_df, reference_api_df, checking_date, geo, sig) + recent_df, reference_api_df, checking_date, geo_type, signal_type) if self.sanity_check_value_diffs: - self.check_avg_val_diffs( - recent_df, reference_api_df, checking_date, geo, sig) + self.check_avg_val_vs_reference( + recent_df, reference_api_df, checking_date, geo_type, signal_type) # Keeps script from checking all files in a test run. if self.test_mode: @@ -728,12 +795,11 @@ def exit(self): self.suppressed_errors.remove(raised_check_id) suppressed_counter += 1 - print(len(subset_raised_errors), "messages") - print(suppressed_counter, "suppressed messages") + print(self.total_checks, "checks run") + print(len(subset_raised_errors), "checks failed") + print(suppressed_counter, "checks suppressed") - if len(subset_raised_errors) == 0: - sys.exit(0) - else: + if len(subset_raised_errors) != 0: for message in subset_raised_errors: print(message) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 5a0b4c5e8..04da7afed 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -446,7 +446,7 @@ def test_same_val(self): test_df = pd.DataFrame(data) ref_df = pd.DataFrame(data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -460,7 +460,7 @@ def test_same_se(self): test_df = pd.DataFrame(data) ref_df = pd.DataFrame(data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -474,7 +474,7 @@ def test_same_n(self): test_df = pd.DataFrame(data) ref_df = pd.DataFrame(data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, 
date.today(), "geo", "signal") assert len(validator.raised_errors) == 0 @@ -488,7 +488,7 @@ def test_10x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") @@ -503,7 +503,7 @@ def test_100x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") @@ -520,7 +520,7 @@ def test_1000x_val(self): test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) - validator.check_avg_val_diffs( + validator.check_avg_val_vs_reference( test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") From 1b5deacbd99c59d34d45f01760c6deed07ebf6f0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 20 Oct 2020 15:57:14 -0400 Subject: [PATCH 091/151] update plans --- validator/PLANS.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 65a45107a..cc79e8b59 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -26,14 +26,15 @@ ## Current features * Errors are summarized in class attribute and printed on exit +* If any non-suppressed errors are raised, the validation process exits with non-zero status * Various check settings are controllable via indicator-specific params.json files * User can manually disable certain checks for certain sets of data using a field in the params.json file * User can enable test mode (checks only a small number of data files) using a field in the params.json file ## Checks + features wishlist, and problems to think about: -* Improve efficiency by grouping all_frames by geo and sig instead of reading data in again via read_geo_sig_cmbo_files(). 
-* Check explicitly for large spikes (avg_val check can detect large jumps and especially large spikes)
+* Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files().
+* Check explicitly for large spikes (avg_val check can detect jumps in average value)
* Which, if any, specific geo_ids are missing (get list from historical data)
* Different test thresholds for different files? Currently some control based on smoothed vs raw signals
* Use known erroneous/anomalous days of source data to tune static thresholds
* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly
* Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends.
  * Long-term trends. Currently, checks only look at a data window of a few days
  * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas
  * E.g. Doctor visits decreasing correlation with cases
  * E.g. WY/RI missing or very low compared to historical
* Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. See [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1601307675021000?thread_ts=1600277030.103500&cid=CV1SYBC90) and [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1600978037007500?thread_ts=1600277030.103500&cid=CV1SYBC90) for more background.
  * Order raised exceptions by p-value
- * Correct p-values for multiple testing
  * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differeng geo regions, e.g.) are "close" to significant
+ * Correct p-values for multiple testing
+ * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamini-Hochberg is a bit more involved but is less sensitive to choice of "family"
* Nicer formatting for error “report”
  * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first
* Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. 
not cause non-zero exit status) From 00014e6ee9cd4dd911ea322e2f4847bd1fa1fdc9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 20 Oct 2020 17:25:14 -0400 Subject: [PATCH 092/151] fix bug causing key error when geo-sig combo file is not found --- validator/delphi_validator/datafetcher.py | 3 --- validator/delphi_validator/validate.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index b32fa21f0..3c2b9605c 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -90,9 +90,6 @@ def read_geo_signal_combo_files(geo_signal_combos, data_folder, filenames, date_ in file and geo_signal_combo[1] in file] if len(files) == 0: - print("FILE_NOT_FOUND: File with geo_type:", - geo_signal_combo[0], " and signal:", geo_signal_combo[1], - " does not exist!") yield pd.DataFrame(), geo_signal_combo[0], geo_signal_combo[1] continue diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index c54985830..45f5188af 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,7 @@ """ Tools to validate CSV source data, including various check methods. 
""" - +import pdb import sys import re import math @@ -725,6 +725,15 @@ def validate(self, export_dir): [name_match_pair[0] for name_match_pair in validate_files], date_slist): + self.increment_total_checks() + + if geo_sig_df.empty: + self.raised_errors.append(ValidationError( + ("check_missing_geo_sig_combo", geo_type, signal_type), + None, + "File with geo_type-signal combo does not exist!")) + continue + max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date(max_date, geo_type, signal_type) self.check_max_allowed_max_date(max_date, geo_type, signal_type) From 1d0a7a253073e12d38c83b91f8207f05851c58be Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 20 Oct 2020 17:45:29 -0400 Subject: [PATCH 093/151] update data fetch error message --- validator/delphi_validator/datafetcher.py | 4 ++-- validator/delphi_validator/validate.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 3c2b9605c..cf7b099d8 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -119,8 +119,8 @@ def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type custom_msg = "Error fetching data from " + str(start_date) + \ " to " + str(end_date) + \ "for data source: " + data_source + \ - ", signal-type: " + signal_type + \ - ", geography-type: " + geo_type + ", signal type: " + signal_type + \ + ", geo type: " + geo_type raise APIDataFetchError(custom_msg) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 45f5188af..9666fa9ff 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,7 @@ """ Tools to validate CSV source data, including various check methods. 
""" -import pdb + import sys import re import math From 41cb38fe4a67d18f541b91f3f263bd74a19199c9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 10:50:17 -0400 Subject: [PATCH 094/151] update plans --- validator/PLANS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index cc79e8b59..c7e5c1b6a 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -36,13 +36,15 @@ * Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). * Check explicitly for large spikes (avg_val check can detect jumps in average value) * Which, if any, specific geo_ids are missing (get list from historical data) -* Different test thresholds for different files? Currently some control based on smoothed vs raw signals -* Use known erroneous/anomalous days of source data to tune static thresholds * Check for duplicate rows +* Use known erroneous/anomalous days of source data to tune static thresholds +* Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. - * Long-term trends. Currently, checks only look at a data window of a few days - * Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks) for ideas + * Long-term trends and correlations between time series. 
Currently, checks only look at a data window of a few days + * Any relevant anomaly detection packages already exist? + * What sorts of hypothesis tests to use? + * See data-quality GitHub issues, Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks), and Dmitry's [indicator validation notebook](https://github.com/cmu-delphi/covidcast-indicators/blob/deploy-jhu/testing_utils/indicator_validation.template.ipynb) for ideas * E.g. Doctor visits decreasing correlation with cases * E.g. WY/RI missing or very low compared to historical * Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. See [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1601307675021000?thread_ts=1600277030.103500&cid=CV1SYBC90) and [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1600978037007500?thread_ts=1600277030.103500&cid=CV1SYBC90) for more background. From 070d7c2c3ff88d3e3658f878346b3ade5ff42038 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 10:51:11 -0400 Subject: [PATCH 095/151] update plans --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index c7e5c1b6a..3c6f766d9 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -34,9 +34,9 @@ ## Checks + features wishlist, and problems to think about: * Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). 
-* Check explicitly for large spikes (avg_val check can detect jumps in average value) * Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows +* Check explicitly for large spikes (avg_val check can detect jumps in average value) * Use known erroneous/anomalous days of source data to tune static thresholds * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly From c93d0b34c4116aeae8b2447f6fa9520ef398c01d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 10:58:45 -0400 Subject: [PATCH 096/151] update plans --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 3c6f766d9..d0409183a 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -43,7 +43,7 @@ * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days * Any relevant anomaly detection packages already exist? - * What sorts of hypothesis tests to use? + * What sorts of hypothesis tests to use? See [time series trend analysis](https://www.genasis.cz/time-series/index.php?pg=home--trend-analysis). 
* See data-quality GitHub issues, Ryan’s [correlation notebook](https://github.com/cmu-delphi/covidcast/tree/main/R-notebooks), and Dmitry's [indicator validation notebook](https://github.com/cmu-delphi/covidcast-indicators/blob/deploy-jhu/testing_utils/indicator_validation.template.ipynb) for ideas * E.g. Doctor visits decreasing correlation with cases * E.g. WY/RI missing or very low compared to historical From 1822e0574ec4837fd5ab02c4d45fb87f02caa687 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 11:06:51 -0400 Subject: [PATCH 097/151] update plans --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index d0409183a..118446a3c 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,9 +37,9 @@ * Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) +* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Use known erroneous/anomalous days of source data to tune static thresholds * Different test thresholds for different files? 
Currently some control based on smoothed vs raw signals -* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values been changed significantly * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days * Any relevant anomaly detection packages already exist? From 12405a31fc99727d1493f15a08f0bc78f6025c02 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 21 Oct 2020 13:59:34 -0400 Subject: [PATCH 098/151] Update PLANS.md --- validator/PLANS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index 118446a3c..c2bf872b4 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -55,3 +55,4 @@ * Nicer formatting for error “report” * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. 
not cause non-zero exit status) +* Run timing tests, check if saving intermediate files will improve effeciency From b4ca1623a713b8b4dc8fc671413049e516be9f2c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:08:42 -0400 Subject: [PATCH 099/151] apply fix for check_avg_val index bug --- validator/delphi_validator/validate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 9666fa9ff..ee3b3192e 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -619,7 +619,8 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, # Set thresholds for raw and smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] - raw_thresholds = pd.DataFrame([1.50, 1.30, 1.80], classes).T + raw_thresholds = pd.DataFrame( + [[1.50, 1.30, 1.80] * len(df_all.index)], classes, index=df_all.index) smoothed_thresholds = raw_thresholds.apply( lambda x: x/(math.sqrt(7) * 1.5)) From 4410b99e70ebe7fbd359e393e853aa79d3295aa3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:22:57 -0400 Subject: [PATCH 100/151] update fix --- validator/delphi_validator/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index ee3b3192e..0bf2b40a7 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -620,7 +620,7 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, # Set thresholds for raw and smoothed variables. 
classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] raw_thresholds = pd.DataFrame( - [[1.50, 1.30, 1.80] * len(df_all.index)], classes, index=df_all.index) + [[1.50, 1.30, 1.80]], classes, index=df_all.index) smoothed_thresholds = raw_thresholds.apply( lambda x: x/(math.sqrt(7) * 1.5)) From 74a80ec9982fac392d4de03ff2e0c07f5b3e290a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:48:49 -0400 Subject: [PATCH 101/151] fix bug in check_avg_val --- validator/delphi_validator/validate.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 0bf2b40a7..f196614ee 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -620,7 +620,7 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, # Set thresholds for raw and smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] raw_thresholds = pd.DataFrame( - [[1.50, 1.30, 1.80]], classes, index=df_all.index) + [[1.50, 1.30, 1.80]], columns=classes, index=df_all.index) smoothed_thresholds = raw_thresholds.apply( lambda x: x/(math.sqrt(7) * 1.5)) @@ -634,11 +634,10 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. 
- mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).bool() or ( - (df_all["variable"] == "val").bool() and ( - abs(df_all["mean_stddiff"]) > thres["val_mean_stddiff"]).bool()) + mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).any() or ( + abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) > thres["val_mean_stddiff"]).any() mean_stdabsdiff_high = ( - df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).bool() + df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).any() if mean_stddiff_high or mean_stdabsdiff_high: self.raised_errors.append(ValidationError( @@ -646,9 +645,9 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, checking_date.date(), geo_type, signal_type), (mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data ' - + '(either semirecent or from API) seem large --- either large increase ' + + 'seem large --- either large increase ' + 'tending toward one direction or large mean absolute difference, relative ' - + 'to average values of corresponding variables. For the former check, ' + + 'to average values of corresponding variables. 
For the former check, ' + 'tolerances for `val` are more restrictive than those for other columns.')) self.increment_total_checks() From 2da16167b010f3687818de519a9f5c1532b9d909 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 22 Oct 2020 13:09:29 -0400 Subject: [PATCH 102/151] update state and national regex to allow capital or lowercase abbreviations --- validator/delphi_validator/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index f196614ee..5ed73100b 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -20,8 +20,8 @@ 'county': '^\d{5}$', 'hrr': '^\d{1,3}$', 'msa': '^\d{5}$', - 'state': '^[a-z]{2}$', - 'national': '^[a-z]{2}$' + 'state': '^[a-zA-Z]{2}$', + 'national': '^[a-zA-Z]{2}$' } From 3ae75a186f4237f1cda9078d862225260f642a21 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Thu, 22 Oct 2020 14:48:58 -0400 Subject: [PATCH 103/151] Update PLANS.md --- validator/PLANS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index c2bf872b4..f651d1aec 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -56,3 +56,4 @@ * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. 
not cause non-zero exit status) * Run timing tests, check if saving intermediate files will improve effeciency +* Ensure validator runs on signals that require AWS credentials From 4af0b1d689159d7fc48c0e5ff8036c944e57725e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 22 Oct 2020 15:04:12 -0400 Subject: [PATCH 104/151] add raised_warnings (non-upload blocking) attribute and print output. simplify exit() procedure. fix bug in check_avg_val procedure --- validator/delphi_validator/validate.py | 89 +++++++++++++++++--------- validator/tests/test_checks.py | 7 +- 2 files changed, 63 insertions(+), 33 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 5ed73100b..e8d8f645d 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -106,7 +106,9 @@ def __init__(self, params): - expected_lag: dict of signal names: int pairs; how many days behind do we expect each signal to be - suppressed_errors: set of check_data_ids used to identify error messages to ignore - - raised_errors: list to append errors to as they are raised + - raised_errors: list to append data upload-blocking errors to as they are raised + - total_checks: incremental counter to track total number of checks run + - raised_warnings: list to append non-data upload-blocking errors to as they are raised """ # Get user settings from params or if not provided, set default. 
self.data_source = params['data_source'] @@ -143,6 +145,8 @@ def __init__(self, params): self.raised_errors = [] self.total_checks = 0 + self.raised_warnings = [] + def increment_total_checks(self): """ Add 1 to total_checks counter """ self.total_checks += 1 @@ -250,11 +254,28 @@ def check_bad_geo_id(self, df_to_test, nameformat, geo_type): Returns: - None """ - def find_all_unexpected_geo_ids(df_to_test, geo_regex): + def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): """ Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. """ + numeric_geo_types = {"msa", "county", "hrr"} + + if geo_type in numeric_geo_types: + # Check if geo_ids were stored as floats (contain decimal point) and + # contents before decimal match the specified regex pattern. + leftover = [geo[1] for geo in df_to_test["geo_id"].str.split( + ".") if len(geo) > 1 and re.match(geo_regex, geo[0])] + + # If any floats found, remove decimal and anything after. + if len(leftover) > 0: + df_to_test["geo_id"] = [geo[0] + for geo in df_to_test["geo_id"].str.split(".")] + + self.raised_warnings.append(ValidationError( + ("check_geo_id_type", nameformat), + None, "geo_ids saved as floats; strings preferred")) + expected_geos = [geo[0] for geo in df_to_test['geo_id'].str.findall( geo_regex) if len(geo) > 0] @@ -272,7 +293,7 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex): geo_type, "Unrecognized geo type")) else: find_all_unexpected_geo_ids( - df_to_test, geo_regex_dict[geo_type]) + df_to_test, geo_regex_dict[geo_type], geo_type) self.increment_total_checks() @@ -634,8 +655,12 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. 
- mean_stddiff_high = (abs(df_all["mean_stddiff"]) > thres["mean_stddiff"]).any() or ( - abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) > thres["val_mean_stddiff"]).any() + mean_stddiff_high = ( + abs(df_all["mean_stddiff"]) > thres["mean_stddiff"] + ).any() or ((df_all["variable"] == "val").any() and ( + abs(df_all[df_all["variable"] == "val"] + ["mean_stddiff"]) > thres["val_mean_stddiff"] + ).any()) mean_stdabsdiff_high = ( df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).any() @@ -787,31 +812,33 @@ def exit(self): """ If any not-suppressed exceptions were raised, print and exit with non-zero status. """ - if self.raised_errors: - suppressed_counter = 0 - subset_raised_errors = [] - - for val_error in self.raised_errors: - # Convert any dates in check_data_id to strings for the purpose of comparing - # to manually suppressed errors. - raised_check_id = tuple([ - item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime)) - else item for item in val_error.check_data_id]) - - if raised_check_id not in self.suppressed_errors: - subset_raised_errors.append(val_error) - else: - self.suppressed_errors.remove(raised_check_id) - suppressed_counter += 1 - - print(self.total_checks, "checks run") - print(len(subset_raised_errors), "checks failed") - print(suppressed_counter, "checks suppressed") - - if len(subset_raised_errors) != 0: - for message in subset_raised_errors: - print(message) - - sys.exit(1) + suppressed_counter = 0 + subset_raised_errors = [] + + for val_error in self.raised_errors: + # Convert any dates in check_data_id to strings for the purpose of comparing + # to manually suppressed errors. 
+ raised_check_id = tuple([ + item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime)) + else item for item in val_error.check_data_id]) + + if raised_check_id not in self.suppressed_errors: + subset_raised_errors.append(val_error) + else: + self.suppressed_errors.remove(raised_check_id) + suppressed_counter += 1 + + print(self.total_checks, "checks run") + print(len(subset_raised_errors), "checks failed") + print(suppressed_counter, "checks suppressed") + print(len(self.raised_warnings), "warnings") + + for message in subset_raised_errors: + print(message) + for message in self.raised_warnings: + print(message) + + if len(subset_raised_errors) != 0: + sys.exit(1) else: sys.exit(0) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 04da7afed..173600bb9 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -212,9 +212,10 @@ def test_invalid_geo_id_state(self): assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 5 + assert len(validator.raised_errors[0].expression) == 4 assert "aa" not in validator.raised_errors[0].expression assert "hi" not in validator.raised_errors[0].expression + assert "HI" not in validator.raised_errors[0].expression def test_invalid_geo_id_national(self): validator = Validator(self.params) @@ -224,8 +225,10 @@ def test_invalid_geo_id_national(self): assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 5 + assert len(validator.raised_errors[0].expression) == 3 assert "us" not in validator.raised_errors[0].expression + assert "US" not in validator.raised_errors[0].expression + assert "SP" not in validator.raised_errors[0].expression class TestCheckBadVal: From 4bf5b3fa90d4756f5a763d7eaaf1891e93e24aa7 Mon Sep 17 00:00:00 2001 From: Nat DeFries 
<42820733+nmdefries@users.noreply.github.com> Date: Thu, 22 Oct 2020 15:06:49 -0400 Subject: [PATCH 105/151] update plans --- validator/PLANS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index f651d1aec..9e64cb7e6 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -55,5 +55,6 @@ * Nicer formatting for error “report” * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status) -* Run timing tests, check if saving intermediate files will improve effeciency +* Run timing tests, check if saving intermediate files will improve effeciency * Ensure validator runs on signals that require AWS credentials +* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in for loop From 02b73b6bc1c1316f977928ad57dabba2d7ad0e82 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Thu, 22 Oct 2020 15:18:34 -0400 Subject: [PATCH 106/151] Update PLANS.md --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 9e64cb7e6..adb52ee55 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -55,6 +55,6 @@ * Nicer formatting for error “report” * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. 
not cause non-zero exit status) -* Run timing tests, check if saving intermediate files will improve effeciency +* Run timing tests, check if saving intermediate files will improve effeciency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials * Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in for loop From f926079786ac9bf46bc296eac2ec1c4d34df22a9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 22 Oct 2020 16:31:05 -0400 Subject: [PATCH 107/151] check_avg_val fix --- validator/delphi_validator/validate.py | 16 ++++++++-------- validator/tests/test_checks.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index e8d8f645d..ae440430e 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -641,7 +641,7 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, # Set thresholds for raw and smoothed variables. classes = ['mean_stddiff', 'val_mean_stddiff', 'mean_stdabsdiff'] raw_thresholds = pd.DataFrame( - [[1.50, 1.30, 1.80]], columns=classes, index=df_all.index) + [[1.50, 1.30, 1.80]], columns=classes) smoothed_thresholds = raw_thresholds.apply( lambda x: x/(math.sqrt(7) * 1.5)) @@ -655,14 +655,14 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. 
- mean_stddiff_high = ( - abs(df_all["mean_stddiff"]) > thres["mean_stddiff"] - ).any() or ((df_all["variable"] == "val").any() and ( - abs(df_all[df_all["variable"] == "val"] - ["mean_stddiff"]) > thres["val_mean_stddiff"] - ).any()) + mean_stddiff_high = + (abs(df_all["mean_stddiff"]) > float(thres["mean_stddiff"])).any() or ( + (df_all["variable"] == "val").any() and + (abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) + > float(thres["val_mean_stddiff"])).any() + ) mean_stdabsdiff_high = ( - df_all["mean_stdabsdiff"] > thres["mean_stdabsdiff"]).any() + df_all["mean_stdabsdiff"] > float(thres["mean_stdabsdiff"])).any() if mean_stddiff_high or mean_stdabsdiff_high: self.raised_errors.append(ValidationError( diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 173600bb9..617b13cba 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -482,6 +482,20 @@ def test_same_n(self): assert len(validator.raised_errors) == 0 + def test_same_val_se_n(self): + validator = Validator(self.params) + + data = {"val": [1, 1, 1, 2, 0, 1], "se": [1, 1, 1, 2, 0, 1], + "sample_size": [1, 1, 1, 2, 0, 1], "geo_id": ["1"] * 6} + + test_df = pd.DataFrame(data) + ref_df = pd.DataFrame(data) + + validator.check_avg_val_vs_reference( + test_df, ref_df, date.today(), "geo", "signal") + + assert len(validator.raised_errors) == 0 + def test_10x_val(self): validator = Validator(self.params) test_data = {"val": [1, 1, 1, 20, 0, 1], "se": [np.nan] * 6, From c09c5a0c67340fe35bb40888e2de5c78cfe68491 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 22 Oct 2020 17:07:31 -0400 Subject: [PATCH 108/151] formatting --- validator/delphi_validator/validate.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index ae440430e..1d97bf1bf 100644 --- a/validator/delphi_validator/validate.py 
+++ b/validator/delphi_validator/validate.py @@ -655,11 +655,11 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, thres = switcher.get(smooth_option, lambda: "Invalid smoothing option") # Check if the calculated mean differences are high compared to the thresholds. - mean_stddiff_high = - (abs(df_all["mean_stddiff"]) > float(thres["mean_stddiff"])).any() or ( - (df_all["variable"] == "val").any() and - (abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) - > float(thres["val_mean_stddiff"])).any() + mean_stddiff_high = ( + abs(df_all["mean_stddiff"]) > float(thres["mean_stddiff"])).any() or ( + (df_all["variable"] == "val").any() and + (abs(df_all[df_all["variable"] == "val"]["mean_stddiff"]) + > float(thres["val_mean_stddiff"])).any() ) mean_stdabsdiff_high = ( df_all["mean_stdabsdiff"] > float(thres["mean_stdabsdiff"])).any() From 014335fe0c2909f5bdcc84874ae18bc758d3eb51 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 10:29:58 -0400 Subject: [PATCH 109/151] update plans --- validator/PLANS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index adb52ee55..6ab5c10ca 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -38,6 +38,7 @@ * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. 
+* Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. * Use known erroneous/anomalous days of source data to tune static thresholds * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. @@ -57,4 +58,4 @@ * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status) * Run timing tests, check if saving intermediate files will improve effeciency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials -* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in for loop +* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checing_date loop From cfbda437f33243a238324adca63bded919a64f01 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:20:43 -0400 Subject: [PATCH 110/151] update plans --- validator/PLANS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index 6ab5c10ca..6bef099d5 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,6 +37,7 @@ * Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) +* Add regex option in `geo_regex_dict` for [DMA regions](https://www.nielsen.com/us/en/intl-campaigns/dma-maps/), used in GHT. 
* Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. * Use known erroneous/anomalous days of source data to tune static thresholds From 183b130131c9139984f4d817cafd98887e092e08 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:23:13 -0400 Subject: [PATCH 111/151] add dma to geo type options --- validator/delphi_validator/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 1d97bf1bf..4bf6979ba 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -20,6 +20,7 @@ 'county': '^\d{5}$', 'hrr': '^\d{1,3}$', 'msa': '^\d{5}$', + 'dma': '^\d{3}$', 'state': '^[a-zA-Z]{2}$', 'national': '^[a-zA-Z]{2}$' } From 9d4e00865593b124ea23bacfe12ea8e1d2c9f2cb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:25:34 -0400 Subject: [PATCH 112/151] allow error messages to include variable values --- validator/delphi_validator/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 4bf6979ba..aeb0079f1 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -440,7 +440,7 @@ def check_bad_sample_size(self, 
df_to_test, nameformat): if not result.empty: self.raised_errors.append(ValidationError( ("check_n_gt_min", nameformat), - result, "sample size must be >= {self.minimum_sample_size}")) + result, f"sample size must be >= {self.minimum_sample_size}")) self.increment_total_checks() @@ -452,7 +452,7 @@ def check_bad_sample_size(self, df_to_test, nameformat): self.raised_errors.append(ValidationError( ("check_n_missing_or_gt_min", nameformat), result, - "sample size must be NA or >= {self.minimum_sample_size}")) + f"sample size must be NA or >= {self.minimum_sample_size}")) self.increment_total_checks() From b2cdc955c3e0f3487cb6babfeae4e2d1a32f47e3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:26:09 -0400 Subject: [PATCH 113/151] update plans --- validator/PLANS.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 6bef099d5..08be94ca6 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,7 +37,6 @@ * Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) -* Add regex option in `geo_regex_dict` for [DMA regions](https://www.nielsen.com/us/en/intl-campaigns/dma-maps/), used in GHT. * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. 
* Use known erroneous/anomalous days of source data to tune static thresholds @@ -57,6 +56,6 @@ * Nicer formatting for error “report” * Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first * Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status) -* Run timing tests, check if saving intermediate files will improve effeciency (currently a bottleneck at "individual file checks" section) +* Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials -* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checing_date loop +* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checking_date loop From 7d47001da87977f49a55fc77153cd9768c7d786c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 17:29:45 -0400 Subject: [PATCH 114/151] update plans --- validator/PLANS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 08be94ca6..c2c299f70 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -34,6 +34,8 @@ ## Checks + features wishlist, and problems to think about: * Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). +* Fix deprecated signals being tested for checking_dates after their deprecation date + * covidcast.metadata includes flag for deprecation? 
* Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) @@ -54,8 +56,6 @@ * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family" * Nicer formatting for error “report” -* Have separate error report sections for data validation checks (which are boolean) and statistical checks, where we want to present the most serious and significant issues first -* Statistical/anomaly checks should be included in the error report but should not block source data upload (i.e. not cause non-zero exit status) * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials * Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checking_date loop From 3d27e29bbf82cd71274db8d9246c7f1b4a840510 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 18:13:19 -0400 Subject: [PATCH 115/151] update plans --- validator/PLANS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index c2c299f70..f7a940880 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -35,7 +35,7 @@ * Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). * Fix deprecated signals being tested for checking_dates after their deprecation date - * covidcast.metadata includes flag for deprecation? + * Does covidcast.metadata include flag for deprecation? 
* Which, if any, specific geo_ids are missing (get list from historical data) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) @@ -59,3 +59,7 @@ * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials * Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checking_date loop +* Combination indicator and jhu running into divide-by-zero error in relative_difference_by_min +* Don't want APIDataFetchError to stop validation. + * Wrap API fetch in try catch, add APIDataFetchError to raised_errors and use previous loop's version of reference_data to do checks? + * Wrap API fetch in try catch, and leave current loop if unable to perform checks? From 0fc164f041f2b58682a3972d6d581154fdaaf744 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 18:26:50 -0400 Subject: [PATCH 116/151] catch APIDataFetchError and add to list of errors, aborting the comparative checks for that set of data. 
Check if reference data is empty and, if so, abort comparative checks for that set of data --- validator/delphi_validator/validate.py | 30 +++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index aeb0079f1..ea73bf3df 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -786,9 +786,33 @@ def validate(self, export_dir): reference_start_date = checking_date - \ min(semirecent_lookbehind, self.max_check_lookbehind) reference_end_date = recent_cutoff_date - timedelta(days=1) - reference_api_df = fetch_api_reference( - self.data_source, reference_start_date, reference_end_date, - geo_type, signal_type) + + try: + reference_api_df = fetch_api_reference( + self.data_source, reference_start_date, reference_end_date, + geo_type, signal_type) + except APIDataFetchError as e: + self.increment_total_checks() + self.raised_errors.append(ValidationError( + ("api_data_fetch_error", + checking_date, geo_type, signal_type), None, e)) + + self.increment_total_checks() + self.raised_errors.append(ValidationError( + ("missing_reference_data", + checking_date, geo_type, signal_type), None, + "reference data is unavailable; comparative checks could not be performed")) + + continue + + if reference_api_df.empty: + self.increment_total_checks() + self.raised_errors.append(ValidationError( + ("empty_reference_data", + checking_date, geo_type, signal_type), None, + "reference data is empty; comparative checks could not be performed")) + + continue self.check_max_date_vs_reference( recent_df, reference_api_df, checking_date, geo_type, signal_type) From 03df082aaa8f15c0e9bf6f1e68b4041b741ffe3f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 23 Oct 2020 18:40:00 -0400 Subject: [PATCH 117/151] update plans --- validator/PLANS.md | 7 +++---- 1 file changed, 3 insertions(+), 4 
deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index f7a940880..4e2fd2a1d 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -57,9 +57,8 @@ * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family" * Nicer formatting for error “report” * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) -* Ensure validator runs on signals that require AWS credentials +* Ensure validator runs on signals that require AWS credentials (in progress) * Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checking_date loop * Combination indicator and jhu running into divide-by-zero error in relative_difference_by_min -* Don't want APIDataFetchError to stop validation. - * Wrap API fetch in try catch, add APIDataFetchError to raised_errors and use previous loop's version of reference_data to do checks? - * Wrap API fetch in try catch, and leave current loop if unable to perform checks? +* If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older + * Currently, any API fetch problems just doesn't do comparative checks at all. From cd367fb19eab0beb0930df28981b25dada6519e5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 26 Oct 2020 17:12:31 -0400 Subject: [PATCH 118/151] update plans. 
Import new error type --- validator/PLANS.md | 2 +- validator/delphi_validator/validate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 4e2fd2a1d..a2db92237 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -54,7 +54,7 @@ * Order raised exceptions by p-value * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differeng geo regions, e.g.) are "close" to significant * Correct p-values for multiple testing - * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family" + * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) * Nicer formatting for error “report” * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials (in progress) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index ea73bf3df..30a8a6d50 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -10,7 +10,7 @@ from datetime import date, datetime, timedelta import pandas as pd -from .errors import ValidationError +from .errors import ValidationError, APIDataFetchError from .datafetcher import filename_regex, \ read_filenames, load_csv, get_geo_signal_combos, \ read_geo_signal_combo_files, fetch_api_reference From 40d7f639705a14dbe99b83b10fcca652957606d1 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 26 Oct 2020 17:45:29 -0400 Subject: [PATCH 119/151] Update PLANS.md --- validator/PLANS.md | 1 + 1 file changed, 1 
insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index a2db92237..ceb920bb6 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -62,3 +62,4 @@ * Combination indicator and jhu running into divide-by-zero error in relative_difference_by_min * If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. +* Potentially implement a check for erratic data sources that wrongly report all 0's (like the error with the Wisconsin data for the 10/26 forecasts) From cee585beac005a48e15779704cab31c2c62a9fe5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 26 Oct 2020 18:23:30 -0400 Subject: [PATCH 120/151] add try block toprint useful info if ZeroDivsionError is raised. change definition of recent_cutoff_date to prevent errors if most recent date is not availabe --- validator/delphi_validator/validate.py | 29 ++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 30a8a6d50..c3afd30ff 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,7 @@ """ Tools to validate CSV source data, including various check methods. 
""" - +import pdb import sys import re import math @@ -551,9 +551,13 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date reference_rows_per_reporting_day = df_to_reference.shape[0] / len( set(df_to_reference["time_value"])) - compare_rows = relative_difference_by_min( - test_rows_per_reporting_day, - reference_rows_per_reporting_day) + try: + compare_rows = relative_difference_by_min( + test_rows_per_reporting_day, + reference_rows_per_reporting_day) + except ZeroDivisionError as e: + print(checking_date, geo_type, signal_type) + raise e if abs(compare_rows) > 0.35: self.raised_errors.append(ValidationError( @@ -722,7 +726,7 @@ def validate(self, export_dir): all_frames = pd.concat(all_frames) - # Get list of dates we expect to see in the source data. + # Get list of dates seen in the source data. date_slist = all_frames['date'].unique().tolist() date_list = list( map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) @@ -744,7 +748,8 @@ def validate(self, export_dir): kroc = 0 # Comparison checks - # Run checks for recent dates in each geo-sig combo vs semirecent (last week) API data. + # Run checks for recent dates in each geo-sig combo vs semirecent (last week) + # API data. for geo_sig_df, geo_type, signal_type in read_geo_signal_combo_files( geo_signal_combos, export_dir, @@ -764,10 +769,11 @@ def validate(self, export_dir): self.check_min_allowed_max_date(max_date, geo_type, signal_type) self.check_max_allowed_max_date(max_date, geo_type, signal_type) - # Check data from a group of dates against recent (previous 7 days, by default) - # data from the API. + # Check data from a group of dates against recent (previous 7 days, + # by default) data from the API. 
for checking_date in date_list: - recent_cutoff_date = checking_date - recent_lookbehind + recent_cutoff_date = checking_date - \ + recent_lookbehind + timedelta(days=1) recent_df = geo_sig_df.query( 'time_value <= @checking_date & time_value >= @recent_cutoff_date') @@ -778,8 +784,9 @@ def validate(self, export_dir): ("check_missing_geo_sig_date_combo", checking_date, geo_type, signal_type), None, - "Test data for a given checking date-geo type-signal type" - + " combination is missing")) + "test data for a given checking date-geo type-signal type" + + " combination is missing. Source data may be missing" + + " for one or more dates")) continue # Reference dataframe runs backwards from the checking_date From b3a2b9cae3e5c8b8002337d0f49386ed9737a9ac Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 26 Oct 2020 18:25:12 -0400 Subject: [PATCH 121/151] remove pdb import --- validator/delphi_validator/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index c3afd30ff..f26180d3a 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,7 @@ """ Tools to validate CSV source data, including various check methods. 
""" -import pdb + import sys import re import math From 1bfd508e59e3e0b80b3cb6f29c1cd605315247fd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 27 Oct 2020 14:54:22 -0400 Subject: [PATCH 122/151] update plans --- validator/PLANS.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index ceb920bb6..c8625daeb 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -33,15 +33,17 @@ ## Checks + features wishlist, and problems to think about: -* Improve efficiency by grouping all_frames by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). -* Fix deprecated signals being tested for checking_dates after their deprecation date - * Does covidcast.metadata include flag for deprecation? -* Which, if any, specific geo_ids are missing (get list from historical data) +* Improve performance and reduce runtime + * General profiling + * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) + * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. + * Pull all checking_date ranges from API at once and subset checking_date loop +* Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. 
If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. -* Use known erroneous/anomalous days of source data to tune static thresholds +* Use known erroneous/anomalous days of source data to tune static thresholds and test behavior * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days @@ -52,14 +54,11 @@ * E.g. WY/RI missing or very low compared to historical * Use hypothesis testing p-values to decide when to raise error or not, instead of static thresholds. Many low but non-significant p-values will also raise error. See [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1601307675021000?thread_ts=1600277030.103500&cid=CV1SYBC90) and [here](https://delphi-org.slack.com/archives/CV1SYBC90/p1600978037007500?thread_ts=1600277030.103500&cid=CV1SYBC90) for more background. * Order raised exceptions by p-value - * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (differeng geo regions, e.g.) are "close" to significant + * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (different geo regions, e.g.) 
are "close" to significant * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) * Nicer formatting for error “report” -* Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Ensure validator runs on signals that require AWS credentials (in progress) -* Improve efficiency and reduce load on API by pulling all checking_date ranges from API at once and subsetting in checking_date loop -* Combination indicator and jhu running into divide-by-zero error in relative_difference_by_min * If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. 
* Potentially implement a check for erratic data sources that wrongly report all 0's (like the error with the Wisconsin data for the 10/26 forecasts) From b0756af0eaefa2d0661930d400b46c4e4c917c97 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 27 Oct 2020 14:57:53 -0400 Subject: [PATCH 123/151] update plans --- validator/PLANS.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index c8625daeb..1f8f0a6bc 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -25,10 +25,10 @@ ## Current features -* Errors are summarized in class attribute and printed on exit +* Errors and warnings are summarized in class attribute and printed on exit * If any non-suppressed errors are raised, the validation process exits with non-zero status * Various check settings are controllable via indicator-specific params.json files -* User can manually disable certain checks for certain sets of data using a field in the params.json file +* User can manually disable specific checks for specific datasets using a field in the params.json file * User can enable test mode (checks only a small number of data files) using a field in the params.json file ## Checks + features wishlist, and problems to think about: @@ -57,7 +57,8 @@ * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (different geo regions, e.g.) are "close" to significant * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) -* Nicer formatting for error “report” +* Nicer formatting for error “report”. E.g. 
if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each +* Easier suppression of many errors * Ensure validator runs on signals that require AWS credentials (in progress) * If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. From 02d0e247a2a1c10c6c7fdcd852de0c885c1313ff Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 28 Oct 2020 12:12:20 -0400 Subject: [PATCH 124/151] update plans --- validator/PLANS.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 1f8f0a6bc..37dfc2658 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -34,10 +34,11 @@ ## Checks + features wishlist, and problems to think about: * Improve performance and reduce runtime - * General profiling + * Profiling (iterate) * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. * Pull all checking_date ranges from API at once and subset checking_date loop + * Parallelize? * Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) @@ -57,9 +58,11 @@ * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (different geo regions, e.g.) 
are "close" to significant * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) -* Nicer formatting for error “report”. E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each -* Easier suppression of many errors -* Ensure validator runs on signals that require AWS credentials (in progress) +* Nicer formatting for error “report”. + * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each +* Easier suppression of many errors at once +* Ensure validator runs on signals that require AWS credentials (iterate) +* Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive * If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. 
* Potentially implement a check for erratic data sources that wrongly report all 0's (like the error with the Wisconsin data for the 10/26 forecasts) From 396b76d42e7cf8077bad900bc6c383d858a7cc7f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 30 Oct 2020 12:08:56 -0400 Subject: [PATCH 125/151] fix missing leading zeroes in numeric geo_id if previously saved as float or int --- validator/delphi_validator/validate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index f26180d3a..0f93741c5 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -260,7 +260,8 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): Check if any geo_ids in df_to_test aren't formatted correctly, according to the geo type dictionary negated_regex_dict. """ - numeric_geo_types = {"msa", "county", "hrr"} + numeric_geo_types = {"msa", "county", "hrr", "dma"} + fill_len = {"msa": 5, "county": 5, "dma": 3} if geo_type in numeric_geo_types: # Check if geo_ids were stored as floats (contain decimal point) and @@ -277,6 +278,12 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): ("check_geo_id_type", nameformat), None, "geo_ids saved as floats; strings preferred")) + if geo_type in fill_len.keys(): + # Left-pad with zeroes up to expected length. Fixes missing leading zeroes + # caused by FIPS codes saved as numeric. + df_to_test["geo_id"] = [geo.zfill(fill_len["geo_type"]) + for geo in df_to_test["geo_id"]] + expected_geos = [geo[0] for geo in df_to_test['geo_id'].str.findall( geo_regex) if len(geo) > 0] From dd4c76d702e08e07fb6fb7e61ce62afb3182e729 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 30 Oct 2020 12:13:27 -0400 Subject: [PATCH 126/151] update plans. 
Fix zfill KeyError --- validator/PLANS.md | 2 +- validator/delphi_validator/validate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 37dfc2658..b91c5ef82 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,7 +37,7 @@ * Profiling (iterate) * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. - * Pull all checking_date ranges from API at once and subset checking_date loop + * Reduce number of/combine API calls. Pull all checking_date ranges from API at once and subset checking_date loop * Parallelize? * Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 0f93741c5..02db89f9d 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -281,7 +281,7 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): if geo_type in fill_len.keys(): # Left-pad with zeroes up to expected length. Fixes missing leading zeroes # caused by FIPS codes saved as numeric. - df_to_test["geo_id"] = [geo.zfill(fill_len["geo_type"]) + df_to_test["geo_id"] = [geo.zfill(fill_len[geo_type]) for geo in df_to_test["geo_id"]] expected_geos = [geo[0] for geo in df_to_test['geo_id'].str.findall( From fd99f224617458deea3b7c6987711ab4695e0d86 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 30 Oct 2020 15:25:45 -0400 Subject: [PATCH 127/151] combine API calls. 
update tets and plans --- validator/PLANS.md | 2 +- validator/delphi_validator/validate.py | 35 +++++++++++++------------- validator/tests/test_checks.py | 8 +++--- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index b91c5ef82..b78f18058 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,7 +37,7 @@ * Profiling (iterate) * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. - * Reduce number of/combine API calls. Pull all checking_date ranges from API at once and subset checking_date loop + * Reduce number of API calls by combining. * Parallelize? * Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 02db89f9d..d195a41e2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -772,10 +772,25 @@ def validate(self, export_dir): "File with geo_type-signal combo does not exist!")) continue + min_date = geo_sig_df["time_value"].min() max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date(max_date, geo_type, signal_type) self.check_max_allowed_max_date(max_date, geo_type, signal_type) + # Pull relevant reference data from API for all dates. 
+ try: + geo_sig_api_df = fetch_api_reference( + self.data_source, + min_date - min(semirecent_lookbehind, + self.max_check_lookbehind), + max_date, geo_type, signal_type) + except APIDataFetchError as e: + self.increment_total_checks() + self.raised_errors.append(ValidationError( + ("api_data_fetch_error", geo_type, signal_type), None, e)) + + continue + # Check data from a group of dates against recent (previous 7 days, # by default) data from the API. for checking_date in date_list: @@ -801,23 +816,9 @@ def validate(self, export_dir): min(semirecent_lookbehind, self.max_check_lookbehind) reference_end_date = recent_cutoff_date - timedelta(days=1) - try: - reference_api_df = fetch_api_reference( - self.data_source, reference_start_date, reference_end_date, - geo_type, signal_type) - except APIDataFetchError as e: - self.increment_total_checks() - self.raised_errors.append(ValidationError( - ("api_data_fetch_error", - checking_date, geo_type, signal_type), None, e)) - - self.increment_total_checks() - self.raised_errors.append(ValidationError( - ("missing_reference_data", - checking_date, geo_type, signal_type), None, - "reference data is unavailable; comparative checks could not be performed")) - - continue + # Subset API data to relevant range of dates. 
+ reference_api_df = geo_sig_api_df.query( + "time_value <= @reference_start_date & time_value >= @reference_end_date") if reference_api_df.empty: self.increment_total_checks() diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index 617b13cba..c8e199f2d 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -152,14 +152,14 @@ class TestCheckBadGeoId: def test_empty_df(self): validator = Validator(self.params) - empty_df = pd.DataFrame(columns=["geo_id"]) + empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) validator.check_bad_geo_id(empty_df, "name", "county") assert len(validator.raised_errors) == 0 def test_invalid_geo_type(self): validator = Validator(self.params) - empty_df = pd.DataFrame(columns=["geo_id"]) + empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) validator.check_bad_geo_id(empty_df, "name", "hello") assert len(validator.raised_errors) == 1 @@ -177,7 +177,7 @@ def test_invalid_geo_id_county(self): assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 4 + assert len(validator.raised_errors[0].expression) == 2 assert "54321" not in validator.raised_errors[0].expression def test_invalid_geo_id_msa(self): @@ -188,7 +188,7 @@ def test_invalid_geo_id_msa(self): assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 4 + assert len(validator.raised_errors[0].expression) == 2 assert "54321" not in validator.raised_errors[0].expression def test_invalid_geo_id_hrr(self): From be920b1516481ca0cb492ede1f46778335cb1a05 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 2 Nov 2020 12:18:55 -0500 Subject: [PATCH 128/151] thread API calls. 
update plans --- validator/PLANS.md | 2 - validator/delphi_validator/validate.py | 79 +++++++++++++++++++++----- validator/tests/test_checks.py | 3 +- 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index b78f18058..90f4c2739 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -37,8 +37,6 @@ * Profiling (iterate) * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. - * Reduce number of API calls by combining. - * Parallelize? * Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index d195a41e2..125a6c04b 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -6,6 +6,7 @@ import sys import re import math +import threading from os.path import join from datetime import date, datetime, timedelta import pandas as pd @@ -750,6 +751,9 @@ def validate(self, export_dir): # Get all expected combinations of geo_type and signal. geo_signal_combos = get_geo_signal_combos(self.data_source) + all_api_df = self.threaded_api_calls( + min(date_list), max(date_list), semirecent_lookbehind, geo_signal_combos) + # Keeps script from checking all files in a test run. 
if self.test_mode: kroc = 0 @@ -769,26 +773,17 @@ def validate(self, export_dir): self.raised_errors.append(ValidationError( ("check_missing_geo_sig_combo", geo_type, signal_type), None, - "File with geo_type-signal combo does not exist!")) + "file with geo_type-signal combo does not exist")) continue - min_date = geo_sig_df["time_value"].min() max_date = geo_sig_df["time_value"].max() self.check_min_allowed_max_date(max_date, geo_type, signal_type) self.check_max_allowed_max_date(max_date, geo_type, signal_type) - # Pull relevant reference data from API for all dates. - try: - geo_sig_api_df = fetch_api_reference( - self.data_source, - min_date - min(semirecent_lookbehind, - self.max_check_lookbehind), - max_date, geo_type, signal_type) - except APIDataFetchError as e: - self.increment_total_checks() - self.raised_errors.append(ValidationError( - ("api_data_fetch_error", geo_type, signal_type), None, e)) + # Get relevant reference data from API dictionary. + geo_sig_api_df = all_api_df[(geo_type, signal_type)] + if geo_sig_api_df is None: continue # Check data from a group of dates against recent (previous 7 days, @@ -848,6 +843,64 @@ def validate(self, export_dir): self.exit() + def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, + geo_type, signal_type, dict_lock, output_dict): + """ + Pull API data for a single geo type-signal combination. Raises + error if data couldn't be retrieved. Saves data to data dict. + """ + # Pull reference data from API for all dates. + try: + geo_sig_api_df = fetch_api_reference( + self.data_source, + min_date - min(semirecent_lookbehind, + self.max_check_lookbehind), + max_date, geo_type, signal_type) + + except APIDataFetchError as e: + self.increment_total_checks() + self.raised_errors.append(ValidationError( + ("api_data_fetch_error", geo_type, signal_type), None, e)) + + geo_sig_api_df = None + + # Use a lock so only one thread can access the dictionary. 
+ dict_lock.acquire() + output_dict[(geo_type, signal_type)] = geo_sig_api_df + dict_lock.release() + + def threaded_api_calls(self, min_date, max_date, semirecent_lookbehind, + geo_signal_combos, n_threads=32): + """ + Get data from API for all geo-signal combinations in a threaded way + to save time. + """ + if n_threads > 32: + n_threads = 32 + print("Warning: Don't run more than 32 threads at once due " + + "to API resource limitations") + + output_dict = dict() + dict_lock = threading.Lock() + + thread_objs = [threading.Thread( + target=self.get_one_api_df, args=(min_date, max_date, + semirecent_lookbehind, + geo_type, signal_type, + dict_lock, output_dict) + ) for geo_type, signal_type in geo_signal_combos] + + for i in range(len(geo_signal_combos) // n_threads + 1): + # Start subset of threads. + for thread in thread_objs[n_threads * i:n_threads * (i + 1)]: + thread.start() + + # Wait until all threads in subset are finished. + for thread in thread_objs[n_threads * i:n_threads * (i + 1)]: + thread.join() + + return output_dict + def exit(self): """ If any not-suppressed exceptions were raised, print and exit with non-zero status. 
diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index c8e199f2d..c3b0d63dc 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -2,7 +2,7 @@ from datetime import date, datetime, timedelta import numpy as np import pandas as pd - +import pdb from delphi_validator.datafetcher import filename_regex from delphi_validator.validate import Validator, make_date_filter @@ -153,6 +153,7 @@ class TestCheckBadGeoId: def test_empty_df(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) + pdb.set_trace() validator.check_bad_geo_id(empty_df, "name", "county") assert len(validator.raised_errors) == 0 From 213fa60972b6432de775f748e30102d2aab34273 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 2 Nov 2020 14:50:52 -0500 Subject: [PATCH 129/151] use semaphore to improve API threading. Fix error in reference data subsetting. --- validator/delphi_validator/validate.py | 30 +++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 125a6c04b..90c62b54b 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -806,14 +806,15 @@ def validate(self, export_dir): + " for one or more dates")) continue - # Reference dataframe runs backwards from the checking_date - reference_start_date = checking_date - \ - min(semirecent_lookbehind, self.max_check_lookbehind) + # Reference dataframe runs backwards from the recent_cutoff_date + reference_start_date = recent_cutoff_date - \ + min(semirecent_lookbehind, self.max_check_lookbehind) - \ + timedelta(days=1) reference_end_date = recent_cutoff_date - timedelta(days=1) # Subset API data to relevant range of dates. 
reference_api_df = geo_sig_api_df.query( - "time_value <= @reference_start_date & time_value >= @reference_end_date") + "time_value >= @reference_start_date & time_value <= @reference_end_date") if reference_api_df.empty: self.increment_total_checks() @@ -844,11 +845,13 @@ def validate(self, export_dir): self.exit() def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, - geo_type, signal_type, dict_lock, output_dict): + geo_type, signal_type, api_semaphore, dict_lock, output_dict): """ Pull API data for a single geo type-signal combination. Raises error if data couldn't be retrieved. Saves data to data dict. """ + api_semaphore.acquire() + # Pull reference data from API for all dates. try: geo_sig_api_df = fetch_api_reference( @@ -864,6 +867,8 @@ def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, geo_sig_api_df = None + api_semaphore.release() + # Use a lock so only one thread can access the dictionary. dict_lock.acquire() output_dict[(geo_type, signal_type)] = geo_sig_api_df @@ -882,22 +887,23 @@ def threaded_api_calls(self, min_date, max_date, semirecent_lookbehind, output_dict = dict() dict_lock = threading.Lock() + api_semaphore = threading.Semaphore(value=n_threads) thread_objs = [threading.Thread( target=self.get_one_api_df, args=(min_date, max_date, semirecent_lookbehind, geo_type, signal_type, + api_semaphore, dict_lock, output_dict) ) for geo_type, signal_type in geo_signal_combos] - for i in range(len(geo_signal_combos) // n_threads + 1): - # Start subset of threads. - for thread in thread_objs[n_threads * i:n_threads * (i + 1)]: - thread.start() + # Start all threads. + for thread in thread_objs: + thread.start() - # Wait until all threads in subset are finished. - for thread in thread_objs[n_threads * i:n_threads * (i + 1)]: - thread.join() + # Wait until all threads are finished. 
+ for thread in thread_objs: + thread.join() return output_dict From 6622beecbc811f444c79bdd9f9e34477517e5f8f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 2 Nov 2020 18:15:58 -0500 Subject: [PATCH 130/151] replace read_geo_signal_combo_files() by querying all-frames. Remove unused vars. Simplify API thread methods. Update plans --- validator/PLANS.md | 8 +-- validator/delphi_validator/datafetcher.py | 42 -------------- validator/delphi_validator/validate.py | 69 ++++++++++++----------- validator/tests/test_checks.py | 3 +- 4 files changed, 40 insertions(+), 82 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 90f4c2739..84ab0de0f 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -33,16 +33,15 @@ ## Checks + features wishlist, and problems to think about: -* Improve performance and reduce runtime +* Improve performance and reduce runtime (what's the goal?) * Profiling (iterate) - * Run timing tests, check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) - * Group `all_frames` by geo type and signal name instead of reading data in again via read_geo_signal_combo_files(). A MultiIndex dataframe may improve performance even more. + * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) + * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance. 
* Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. -* Use known erroneous/anomalous days of source data to tune static thresholds and test behavior * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days @@ -60,6 +59,7 @@ * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? 
But it still has to be clear how to suppress each * Easier suppression of many errors at once * Ensure validator runs on signals that require AWS credentials (iterate) +* Use known erroneous/anomalous days of source data to tune static thresholds and test behavior * Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive * If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index cf7b099d8..3fe71d4e9 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -65,48 +65,6 @@ def get_geo_signal_combos(data_source): return geo_signal_combos -def read_geo_signal_combo_files(geo_signal_combos, data_folder, filenames, date_slist): - """ - Generator that assembles data within the specified date range for a given geo_signal_combo. - - Arguments: - - geo_signal_combos: list of geo type-signal type combinations that we expect to see, - based on combinations reported available by COVIDcast metadata - - data_folder: path to the directory containing CSV data files. 
- - filenames: list of filenames - - date_slist: list of dates (formatted as strings) to check - - Returns: - - dataframe containing data for all dates in date_slist for a given - geo type-signal type combination - - relevant geo type (str) - - relevant signal type (str) - """ - for geo_signal_combo in geo_signal_combos: - df_list = list() - - # Get all filenames for this geo_type and signal_type - files = [file for file in filenames if geo_signal_combo[0] - in file and geo_signal_combo[1] in file] - - if len(files) == 0: - yield pd.DataFrame(), geo_signal_combo[0], geo_signal_combo[1] - continue - - # Load data from all found files. - for file in files: - data_df = load_csv(join(data_folder, file)) - for date in date_slist: - - # Add data's date, from CSV name, as new column - if file.find(date) != -1: - source_date = datetime.strptime(date, '%Y%m%d') - data_df['time_value'] = source_date - df_list.append(data_df) - - yield pd.concat(df_list), geo_signal_combo[0], geo_signal_combo[1] - - def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type): """ Get and process API data for use as a reference. Formatting is changed diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 90c62b54b..a56cfe4c4 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,6 @@ """ Tools to validate CSV source data, including various check methods. """ - import sys import re import math @@ -14,7 +13,7 @@ from .errors import ValidationError, APIDataFetchError from .datafetcher import filename_regex, \ read_filenames, load_csv, get_geo_signal_combos, \ - read_geo_signal_combo_files, fetch_api_reference + fetch_api_reference # Recognized geo types. 
geo_regex_dict = { @@ -116,10 +115,10 @@ def __init__(self, params): self.data_source = params['data_source'] # Date/time settings - span_length = timedelta(days=params['span_length']) + self.span_length = timedelta(days=params['span_length']) self.end_date = date.today() if params['end_date'] == "latest" else datetime.strptime( params['end_date'], '%Y-%m-%d').date() - self.start_date = self.end_date - span_length + self.start_date = self.end_date - self.span_length self.generation_date = date.today() # General options: flags, thresholds @@ -153,7 +152,7 @@ def increment_total_checks(self): """ Add 1 to total_checks counter """ self.total_checks += 1 - def check_missing_dates(self, daily_filenames): + def check_missing_date_files(self, daily_filenames): """ Check for missing dates between the specified start and end dates. @@ -421,6 +420,9 @@ def check_bad_se(self, df_to_test, nameformat): self.increment_total_checks() + # Remove se_upper_limit column. + df_to_test.drop(columns=["se_upper_limit"]) + def check_bad_sample_size(self, df_to_test, nameformat): """ Check sample sizes for validity. 
@@ -483,7 +485,7 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type): if max_date < self.generation_date - thres: self.raised_errors.append(ValidationError( ("check_min_max_date", geo_type, signal_type), - max_date.date(), + max_date, "date of most recent generated file seems too long ago")) self.increment_total_checks() @@ -503,7 +505,7 @@ def check_max_allowed_max_date(self, max_date, geo_type, signal_type): if max_date > self.generation_date: self.raised_errors.append(ValidationError( ("check_max_max_date", geo_type, signal_type), - max_date.date(), + max_date, "date of most recent generated file seems too recent")) self.increment_total_checks() @@ -680,7 +682,7 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date, if mean_stddiff_high or mean_stdabsdiff_high: self.raised_errors.append(ValidationError( ("check_test_vs_reference_avg_changed", - checking_date.date(), geo_type, signal_type), + checking_date, geo_type, signal_type), (mean_stddiff_high, mean_stdabsdiff_high), 'Average differences in variables by geo_id between recent & reference data ' + 'seem large --- either large increase ' @@ -707,7 +709,7 @@ def validate(self, export_dir): # Make list of tuples of CSV names and regex match objects. validate_files = [(f, m) for (f, m) in export_files if date_filter(m)] - self.check_missing_dates(validate_files) + self.check_missing_date_files(validate_files) self.check_settings() all_frames = [] @@ -726,7 +728,8 @@ def validate(self, export_dir): # Get geo_type, date, and signal name as specified by CSV name. data_df['geo_type'] = match.groupdict()['geo_type'] - data_df['date'] = match.groupdict()['date'] + data_df['time_value'] = datetime.strptime( + match.groupdict()['date'], "%Y%m%d").date() data_df['signal'] = match.groupdict()['signal'] # Add current CSV data to all_frames. @@ -734,13 +737,8 @@ def validate(self, export_dir): all_frames = pd.concat(all_frames) - # Get list of dates seen in the source data. 
- date_slist = all_frames['date'].unique().tolist() - date_list = list( - map(lambda x: datetime.strptime(x, '%Y%m%d'), date_slist)) - # recent_lookbehind: start from the check date and working backward in time, - # how many days do we want to check for anomalies? + # how many days at a time do we want to check for anomalies? # Choosing 1 day checks just the daily data. recent_lookbehind = timedelta(days=1) @@ -748,24 +746,30 @@ def validate(self, export_dir): # in time, how many days do we use to form the reference statistics. semirecent_lookbehind = timedelta(days=7) + # Get list of dates we want to check. + date_list = [self.start_date + timedelta(days=days) + for days in range(self.span_length.days + 1)] + # Get all expected combinations of geo_type and signal. geo_signal_combos = get_geo_signal_combos(self.data_source) all_api_df = self.threaded_api_calls( - min(date_list), max(date_list), semirecent_lookbehind, geo_signal_combos) + self.start_date - min(semirecent_lookbehind, + self.max_check_lookbehind), + self.end_date, geo_signal_combos) # Keeps script from checking all files in a test run. if self.test_mode: kroc = 0 # Comparison checks - # Run checks for recent dates in each geo-sig combo vs semirecent (last week) - # API data. - for geo_sig_df, geo_type, signal_type in read_geo_signal_combo_files( - geo_signal_combos, - export_dir, - [name_match_pair[0] for name_match_pair in validate_files], - date_slist): + # Run checks for recent dates in each geo-sig combo vs semirecent (previous + # week) API data. + for geo_type, signal_type in geo_signal_combos: + geo_sig_df = all_frames.query( + "geo_type == @geo_type & signal == @signal_type") + # Drop unused columns. 
+ geo_sig_df.drop(columns=["geo_type", "signal"]) self.increment_total_checks() @@ -816,13 +820,13 @@ def validate(self, export_dir): reference_api_df = geo_sig_api_df.query( "time_value >= @reference_start_date & time_value <= @reference_end_date") + self.increment_total_checks() + if reference_api_df.empty: - self.increment_total_checks() self.raised_errors.append(ValidationError( ("empty_reference_data", checking_date, geo_type, signal_type), None, "reference data is empty; comparative checks could not be performed")) - continue self.check_max_date_vs_reference( @@ -844,8 +848,9 @@ def validate(self, export_dir): self.exit() - def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, - geo_type, signal_type, api_semaphore, dict_lock, output_dict): + def get_one_api_df(self, min_date, max_date, + geo_type, signal_type, + api_semaphore, dict_lock, output_dict): """ Pull API data for a single geo type-signal combination. Raises error if data couldn't be retrieved. Saves data to data dict. @@ -855,10 +860,7 @@ def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, # Pull reference data from API for all dates. 
try: geo_sig_api_df = fetch_api_reference( - self.data_source, - min_date - min(semirecent_lookbehind, - self.max_check_lookbehind), - max_date, geo_type, signal_type) + self.data_source, min_date, max_date, geo_type, signal_type) except APIDataFetchError as e: self.increment_total_checks() @@ -874,7 +876,7 @@ def get_one_api_df(self, min_date, max_date, semirecent_lookbehind, output_dict[(geo_type, signal_type)] = geo_sig_api_df dict_lock.release() - def threaded_api_calls(self, min_date, max_date, semirecent_lookbehind, + def threaded_api_calls(self, min_date, max_date, geo_signal_combos, n_threads=32): """ Get data from API for all geo-signal combinations in a threaded way @@ -891,7 +893,6 @@ def threaded_api_calls(self, min_date, max_date, semirecent_lookbehind, thread_objs = [threading.Thread( target=self.get_one_api_df, args=(min_date, max_date, - semirecent_lookbehind, geo_type, signal_type, api_semaphore, dict_lock, output_dict) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index c3b0d63dc..c8e199f2d 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -2,7 +2,7 @@ from datetime import date, datetime, timedelta import numpy as np import pandas as pd -import pdb + from delphi_validator.datafetcher import filename_regex from delphi_validator.validate import Validator, make_date_filter @@ -153,7 +153,6 @@ class TestCheckBadGeoId: def test_empty_df(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) - pdb.set_trace() validator.check_bad_geo_id(empty_df, "name", "county") assert len(validator.raised_errors) == 0 From 5d8f2207b13cca2456c6b3e675af53462ddc6ec3 Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Mon, 2 Nov 2020 17:15:54 -0500 Subject: [PATCH 131/151] First draft of geo_id validator Code to validate that all geo_id values are valid, by comparing against a list of known values. 
--- validator/csv/county_geo.csv | 3194 ++++++++++++++++++++++++ validator/csv/dma_geo.csv | 211 ++ validator/csv/hrr_geo.csv | 307 +++ validator/csv/msa_geo.csv | 382 +++ validator/csv/state_geo.csv | 53 + validator/delphi_validator/validate.py | 20 +- validator/dev/unique_geoids.R | 12 + 7 files changed, 4178 insertions(+), 1 deletion(-) create mode 100644 validator/csv/county_geo.csv create mode 100644 validator/csv/dma_geo.csv create mode 100644 validator/csv/hrr_geo.csv create mode 100644 validator/csv/msa_geo.csv create mode 100644 validator/csv/state_geo.csv create mode 100644 validator/dev/unique_geoids.R diff --git a/validator/csv/county_geo.csv b/validator/csv/county_geo.csv new file mode 100644 index 000000000..a6127cf9b --- /dev/null +++ b/validator/csv/county_geo.csv @@ -0,0 +1,3194 @@ +"geo_id" +"01000" +"01001" +"01003" +"01005" +"01007" +"01009" +"01011" +"01013" +"01015" +"01017" +"01019" +"01021" +"01023" +"01025" +"01027" +"01029" +"01031" +"01033" +"01035" +"01037" +"01039" +"01041" +"01043" +"01045" +"01047" +"01049" +"01051" +"01053" +"01055" +"01057" +"01059" +"01061" +"01063" +"01065" +"01067" +"01069" +"01071" +"01073" +"01075" +"01077" +"01079" +"01081" +"01083" +"01085" +"01087" +"01089" +"01091" +"01093" +"01095" +"01097" +"01099" +"01101" +"01103" +"01105" +"01107" +"01109" +"01111" +"01113" +"01115" +"01117" +"01119" +"01121" +"01123" +"01125" +"01127" +"01129" +"01131" +"01133" +"02000" +"02013" +"02016" +"02020" +"02050" +"02060" +"02068" +"02070" +"02090" +"02100" +"02105" +"02110" +"02122" +"02130" +"02150" +"02158" +"02164" +"02170" +"02180" +"02185" +"02188" +"02195" +"02198" +"02220" +"02230" +"02240" +"02261" +"02275" +"02282" +"02290" +"04000" +"04001" +"04003" +"04005" +"04007" +"04009" +"04011" +"04012" +"04013" +"04015" +"04017" +"04019" +"04021" +"04023" +"04025" +"04027" +"05000" +"05001" +"05003" +"05005" +"05007" +"05009" +"05011" +"05013" +"05015" +"05017" +"05019" +"05021" +"05023" +"05025" +"05027" +"05029" +"05031" 
+"05033" +"05035" +"05037" +"05039" +"05041" +"05043" +"05045" +"05047" +"05049" +"05051" +"05053" +"05055" +"05057" +"05059" +"05061" +"05063" +"05065" +"05067" +"05069" +"05071" +"05073" +"05075" +"05077" +"05079" +"05081" +"05083" +"05085" +"05087" +"05089" +"05091" +"05093" +"05095" +"05097" +"05099" +"05101" +"05103" +"05105" +"05107" +"05109" +"05111" +"05113" +"05115" +"05117" +"05119" +"05121" +"05123" +"05125" +"05127" +"05129" +"05131" +"05133" +"05135" +"05137" +"05139" +"05141" +"05143" +"05145" +"05147" +"05149" +"06000" +"06001" +"06003" +"06005" +"06007" +"06009" +"06011" +"06013" +"06015" +"06017" +"06019" +"06021" +"06023" +"06025" +"06027" +"06029" +"06031" +"06033" +"06035" +"06037" +"06039" +"06041" +"06043" +"06045" +"06047" +"06049" +"06051" +"06053" +"06055" +"06057" +"06059" +"06061" +"06063" +"06065" +"06067" +"06069" +"06071" +"06073" +"06075" +"06077" +"06079" +"06081" +"06083" +"06085" +"06087" +"06089" +"06091" +"06093" +"06095" +"06097" +"06099" +"06101" +"06103" +"06105" +"06107" +"06109" +"06111" +"06113" +"06115" +"08000" +"08001" +"08003" +"08005" +"08007" +"08009" +"08011" +"08013" +"08014" +"08015" +"08017" +"08019" +"08021" +"08023" +"08025" +"08027" +"08029" +"08031" +"08033" +"08035" +"08037" +"08039" +"08041" +"08043" +"08045" +"08047" +"08049" +"08051" +"08053" +"08055" +"08057" +"08059" +"08061" +"08063" +"08065" +"08067" +"08069" +"08071" +"08073" +"08075" +"08077" +"08079" +"08081" +"08083" +"08085" +"08087" +"08089" +"08091" +"08093" +"08095" +"08097" +"08099" +"08101" +"08103" +"08105" +"08107" +"08109" +"08111" +"08113" +"08115" +"08117" +"08119" +"08121" +"08123" +"08125" +"09000" +"09001" +"09003" +"09005" +"09007" +"09009" +"09011" +"09013" +"09015" +"10000" +"10001" +"10003" +"10005" +"11001" +"12000" +"12001" +"12003" +"12005" +"12007" +"12009" +"12011" +"12013" +"12015" +"12017" +"12019" +"12021" +"12023" +"12027" +"12029" +"12031" +"12033" +"12035" +"12037" +"12039" +"12041" +"12043" +"12045" +"12047" +"12049" 
+"12051" +"12053" +"12055" +"12057" +"12059" +"12061" +"12063" +"12065" +"12067" +"12069" +"12071" +"12073" +"12075" +"12077" +"12079" +"12081" +"12083" +"12085" +"12086" +"12087" +"12089" +"12091" +"12093" +"12095" +"12097" +"12099" +"12101" +"12103" +"12105" +"12107" +"12109" +"12111" +"12113" +"12115" +"12117" +"12119" +"12121" +"12123" +"12125" +"12127" +"12129" +"12131" +"12133" +"13000" +"13001" +"13003" +"13005" +"13007" +"13009" +"13011" +"13013" +"13015" +"13017" +"13019" +"13021" +"13023" +"13025" +"13027" +"13029" +"13031" +"13033" +"13035" +"13037" +"13039" +"13043" +"13045" +"13047" +"13049" +"13051" +"13053" +"13055" +"13057" +"13059" +"13061" +"13063" +"13065" +"13067" +"13069" +"13071" +"13073" +"13075" +"13077" +"13079" +"13081" +"13083" +"13085" +"13087" +"13089" +"13091" +"13093" +"13095" +"13097" +"13099" +"13101" +"13103" +"13105" +"13107" +"13109" +"13111" +"13113" +"13115" +"13117" +"13119" +"13121" +"13123" +"13125" +"13127" +"13129" +"13131" +"13133" +"13135" +"13137" +"13139" +"13141" +"13143" +"13145" +"13147" +"13149" +"13151" +"13153" +"13155" +"13157" +"13159" +"13161" +"13163" +"13165" +"13167" +"13169" +"13171" +"13173" +"13175" +"13177" +"13179" +"13181" +"13183" +"13185" +"13187" +"13189" +"13191" +"13193" +"13195" +"13197" +"13199" +"13201" +"13205" +"13207" +"13209" +"13211" +"13213" +"13215" +"13217" +"13219" +"13221" +"13223" +"13225" +"13227" +"13229" +"13231" +"13233" +"13235" +"13237" +"13239" +"13241" +"13243" +"13245" +"13247" +"13249" +"13251" +"13253" +"13255" +"13257" +"13259" +"13261" +"13263" +"13265" +"13267" +"13269" +"13271" +"13273" +"13275" +"13277" +"13279" +"13281" +"13283" +"13285" +"13287" +"13289" +"13291" +"13293" +"13295" +"13297" +"13299" +"13301" +"13303" +"13305" +"13307" +"13309" +"13311" +"13313" +"13315" +"13317" +"13319" +"13321" +"15000" +"15001" +"15003" +"15005" +"15007" +"15009" +"16000" +"16001" +"16003" +"16005" +"16007" +"16009" +"16011" +"16013" +"16015" +"16017" +"16019" +"16021" +"16023" 
+"16025" +"16027" +"16029" +"16031" +"16033" +"16035" +"16037" +"16039" +"16041" +"16043" +"16045" +"16047" +"16049" +"16051" +"16053" +"16055" +"16057" +"16059" +"16061" +"16063" +"16065" +"16067" +"16069" +"16071" +"16073" +"16075" +"16077" +"16079" +"16081" +"16083" +"16085" +"16087" +"17000" +"17001" +"17003" +"17005" +"17007" +"17009" +"17011" +"17013" +"17015" +"17017" +"17019" +"17021" +"17023" +"17025" +"17027" +"17029" +"17031" +"17033" +"17035" +"17037" +"17039" +"17041" +"17043" +"17045" +"17047" +"17049" +"17051" +"17053" +"17055" +"17057" +"17059" +"17061" +"17063" +"17065" +"17067" +"17069" +"17071" +"17073" +"17075" +"17077" +"17079" +"17081" +"17083" +"17085" +"17087" +"17089" +"17091" +"17093" +"17095" +"17097" +"17099" +"17101" +"17103" +"17105" +"17107" +"17109" +"17111" +"17113" +"17115" +"17117" +"17119" +"17121" +"17123" +"17125" +"17127" +"17129" +"17131" +"17133" +"17135" +"17137" +"17139" +"17141" +"17143" +"17145" +"17147" +"17149" +"17151" +"17153" +"17155" +"17157" +"17159" +"17161" +"17163" +"17165" +"17167" +"17169" +"17171" +"17173" +"17175" +"17177" +"17179" +"17181" +"17183" +"17185" +"17187" +"17189" +"17191" +"17193" +"17195" +"17197" +"17199" +"17201" +"17203" +"18000" +"18001" +"18003" +"18005" +"18007" +"18009" +"18011" +"18013" +"18015" +"18017" +"18019" +"18021" +"18023" +"18025" +"18027" +"18029" +"18031" +"18033" +"18035" +"18037" +"18039" +"18041" +"18043" +"18045" +"18047" +"18049" +"18051" +"18053" +"18055" +"18057" +"18059" +"18061" +"18063" +"18065" +"18067" +"18069" +"18071" +"18073" +"18075" +"18077" +"18079" +"18081" +"18083" +"18085" +"18087" +"18089" +"18091" +"18093" +"18095" +"18097" +"18099" +"18101" +"18103" +"18105" +"18107" +"18109" +"18111" +"18113" +"18115" +"18117" +"18119" +"18121" +"18123" +"18125" +"18127" +"18129" +"18131" +"18133" +"18135" +"18137" +"18139" +"18141" +"18143" +"18145" +"18147" +"18149" +"18151" +"18153" +"18155" +"18157" +"18159" +"18161" +"18163" +"18165" +"18167" +"18169" +"18171" 
+"18173" +"18175" +"18177" +"18179" +"18181" +"18183" +"19000" +"19001" +"19003" +"19005" +"19007" +"19009" +"19011" +"19013" +"19015" +"19017" +"19019" +"19021" +"19023" +"19025" +"19027" +"19029" +"19031" +"19033" +"19035" +"19037" +"19039" +"19041" +"19043" +"19045" +"19047" +"19049" +"19051" +"19053" +"19055" +"19057" +"19059" +"19061" +"19063" +"19065" +"19067" +"19069" +"19071" +"19073" +"19075" +"19077" +"19079" +"19081" +"19083" +"19085" +"19087" +"19089" +"19091" +"19093" +"19095" +"19097" +"19099" +"19101" +"19103" +"19105" +"19107" +"19109" +"19111" +"19113" +"19115" +"19117" +"19119" +"19121" +"19123" +"19125" +"19127" +"19129" +"19131" +"19133" +"19135" +"19137" +"19139" +"19141" +"19143" +"19145" +"19147" +"19149" +"19151" +"19153" +"19155" +"19157" +"19159" +"19161" +"19163" +"19165" +"19167" +"19169" +"19171" +"19173" +"19175" +"19177" +"19179" +"19181" +"19183" +"19185" +"19187" +"19189" +"19191" +"19193" +"19195" +"19197" +"20000" +"20001" +"20003" +"20005" +"20007" +"20009" +"20011" +"20013" +"20015" +"20017" +"20019" +"20021" +"20023" +"20025" +"20027" +"20029" +"20031" +"20033" +"20035" +"20037" +"20039" +"20041" +"20043" +"20045" +"20047" +"20049" +"20051" +"20053" +"20055" +"20057" +"20059" +"20061" +"20063" +"20065" +"20067" +"20069" +"20071" +"20073" +"20075" +"20077" +"20079" +"20081" +"20083" +"20085" +"20087" +"20089" +"20091" +"20093" +"20095" +"20097" +"20099" +"20101" +"20103" +"20105" +"20107" +"20109" +"20111" +"20113" +"20115" +"20117" +"20119" +"20121" +"20123" +"20125" +"20127" +"20129" +"20131" +"20133" +"20135" +"20137" +"20139" +"20141" +"20143" +"20145" +"20147" +"20149" +"20151" +"20153" +"20155" +"20157" +"20159" +"20161" +"20163" +"20165" +"20167" +"20169" +"20171" +"20173" +"20175" +"20177" +"20179" +"20181" +"20183" +"20185" +"20187" +"20189" +"20191" +"20193" +"20195" +"20197" +"20199" +"20201" +"20203" +"20205" +"20207" +"20209" +"21000" +"21001" +"21003" +"21005" +"21007" +"21009" +"21011" +"21013" +"21015" +"21017" 
+"21019" +"21021" +"21023" +"21025" +"21027" +"21029" +"21031" +"21033" +"21035" +"21037" +"21039" +"21041" +"21043" +"21045" +"21047" +"21049" +"21051" +"21053" +"21055" +"21057" +"21059" +"21061" +"21063" +"21065" +"21067" +"21069" +"21071" +"21073" +"21075" +"21077" +"21079" +"21081" +"21083" +"21085" +"21087" +"21089" +"21091" +"21093" +"21095" +"21097" +"21099" +"21101" +"21103" +"21105" +"21107" +"21109" +"21111" +"21113" +"21115" +"21117" +"21119" +"21121" +"21123" +"21125" +"21127" +"21129" +"21131" +"21133" +"21135" +"21137" +"21139" +"21141" +"21143" +"21145" +"21147" +"21149" +"21151" +"21153" +"21155" +"21157" +"21159" +"21161" +"21163" +"21165" +"21167" +"21169" +"21171" +"21173" +"21175" +"21177" +"21179" +"21181" +"21183" +"21185" +"21187" +"21189" +"21191" +"21193" +"21195" +"21197" +"21199" +"21201" +"21203" +"21205" +"21207" +"21209" +"21211" +"21213" +"21215" +"21217" +"21219" +"21221" +"21223" +"21225" +"21227" +"21229" +"21231" +"21233" +"21235" +"21237" +"21239" +"22000" +"22001" +"22003" +"22005" +"22007" +"22009" +"22011" +"22013" +"22015" +"22017" +"22019" +"22021" +"22023" +"22025" +"22027" +"22029" +"22031" +"22033" +"22035" +"22037" +"22039" +"22041" +"22043" +"22045" +"22047" +"22049" +"22051" +"22053" +"22055" +"22057" +"22059" +"22061" +"22063" +"22065" +"22067" +"22069" +"22071" +"22073" +"22075" +"22077" +"22079" +"22081" +"22083" +"22085" +"22087" +"22089" +"22091" +"22093" +"22095" +"22097" +"22099" +"22101" +"22103" +"22105" +"22107" +"22109" +"22111" +"22113" +"22115" +"22117" +"22119" +"22121" +"22123" +"22125" +"22127" +"23000" +"23001" +"23003" +"23005" +"23007" +"23009" +"23011" +"23013" +"23015" +"23017" +"23019" +"23021" +"23023" +"23025" +"23027" +"23029" +"23031" +"24000" +"24001" +"24003" +"24005" +"24009" +"24011" +"24013" +"24015" +"24017" +"24019" +"24021" +"24023" +"24025" +"24027" +"24029" +"24031" +"24033" +"24035" +"24037" +"24039" +"24041" +"24043" +"24045" +"24047" +"24510" +"25000" +"25001" +"25003" +"25005" 
+"25007" +"25009" +"25011" +"25013" +"25015" +"25017" +"25019" +"25021" +"25023" +"25025" +"25027" +"26000" +"26001" +"26003" +"26005" +"26007" +"26009" +"26011" +"26013" +"26015" +"26017" +"26019" +"26021" +"26023" +"26025" +"26027" +"26029" +"26031" +"26033" +"26035" +"26037" +"26039" +"26041" +"26043" +"26045" +"26047" +"26049" +"26051" +"26053" +"26055" +"26057" +"26059" +"26061" +"26063" +"26065" +"26067" +"26069" +"26071" +"26073" +"26075" +"26077" +"26079" +"26081" +"26083" +"26085" +"26087" +"26089" +"26091" +"26093" +"26095" +"26097" +"26099" +"26101" +"26103" +"26105" +"26107" +"26109" +"26111" +"26113" +"26115" +"26117" +"26119" +"26121" +"26123" +"26125" +"26127" +"26129" +"26131" +"26133" +"26135" +"26137" +"26139" +"26141" +"26143" +"26145" +"26147" +"26149" +"26151" +"26153" +"26155" +"26157" +"26159" +"26161" +"26163" +"26165" +"27000" +"27001" +"27003" +"27005" +"27007" +"27009" +"27011" +"27013" +"27015" +"27017" +"27019" +"27021" +"27023" +"27025" +"27027" +"27029" +"27031" +"27033" +"27035" +"27037" +"27039" +"27041" +"27043" +"27045" +"27047" +"27049" +"27051" +"27053" +"27055" +"27057" +"27059" +"27061" +"27063" +"27065" +"27067" +"27069" +"27071" +"27073" +"27075" +"27077" +"27079" +"27081" +"27083" +"27085" +"27087" +"27089" +"27091" +"27093" +"27095" +"27097" +"27099" +"27101" +"27103" +"27105" +"27107" +"27109" +"27111" +"27113" +"27115" +"27117" +"27119" +"27121" +"27123" +"27125" +"27127" +"27129" +"27131" +"27133" +"27135" +"27137" +"27139" +"27141" +"27143" +"27145" +"27147" +"27149" +"27151" +"27153" +"27155" +"27157" +"27159" +"27161" +"27163" +"27165" +"27167" +"27169" +"27171" +"27173" +"28000" +"28001" +"28003" +"28005" +"28007" +"28009" +"28011" +"28013" +"28015" +"28017" +"28019" +"28021" +"28023" +"28025" +"28027" +"28029" +"28031" +"28033" +"28035" +"28037" +"28039" +"28041" +"28043" +"28045" +"28047" +"28049" +"28051" +"28053" +"28055" +"28057" +"28059" +"28061" +"28063" +"28065" +"28067" +"28069" +"28071" +"28073" +"28075" 
+"28077" +"28079" +"28081" +"28083" +"28085" +"28087" +"28089" +"28091" +"28093" +"28095" +"28097" +"28099" +"28101" +"28103" +"28105" +"28107" +"28109" +"28111" +"28113" +"28115" +"28117" +"28119" +"28121" +"28123" +"28125" +"28127" +"28129" +"28131" +"28133" +"28135" +"28137" +"28139" +"28141" +"28143" +"28145" +"28147" +"28149" +"28151" +"28153" +"28155" +"28157" +"28159" +"28161" +"28163" +"29000" +"29001" +"29003" +"29005" +"29007" +"29009" +"29011" +"29013" +"29015" +"29017" +"29019" +"29021" +"29023" +"29025" +"29027" +"29029" +"29031" +"29033" +"29035" +"29037" +"29039" +"29041" +"29043" +"29045" +"29047" +"29049" +"29051" +"29053" +"29055" +"29057" +"29059" +"29061" +"29063" +"29065" +"29067" +"29069" +"29071" +"29073" +"29075" +"29077" +"29079" +"29081" +"29083" +"29085" +"29087" +"29089" +"29091" +"29093" +"29095" +"29097" +"29099" +"29101" +"29103" +"29105" +"29107" +"29109" +"29111" +"29113" +"29115" +"29117" +"29119" +"29121" +"29123" +"29125" +"29127" +"29129" +"29131" +"29133" +"29135" +"29137" +"29139" +"29141" +"29143" +"29145" +"29147" +"29149" +"29151" +"29153" +"29155" +"29157" +"29159" +"29161" +"29163" +"29165" +"29167" +"29169" +"29171" +"29173" +"29175" +"29177" +"29179" +"29181" +"29183" +"29185" +"29186" +"29187" +"29189" +"29195" +"29197" +"29199" +"29201" +"29203" +"29205" +"29207" +"29209" +"29211" +"29213" +"29215" +"29217" +"29219" +"29221" +"29223" +"29225" +"29227" +"29229" +"29510" +"30000" +"30001" +"30003" +"30005" +"30007" +"30009" +"30011" +"30013" +"30015" +"30017" +"30019" +"30021" +"30023" +"30025" +"30027" +"30029" +"30031" +"30033" +"30035" +"30037" +"30039" +"30041" +"30043" +"30045" +"30047" +"30049" +"30051" +"30053" +"30055" +"30057" +"30059" +"30061" +"30063" +"30065" +"30067" +"30069" +"30071" +"30073" +"30075" +"30077" +"30079" +"30081" +"30083" +"30085" +"30087" +"30089" +"30091" +"30093" +"30095" +"30097" +"30099" +"30101" +"30103" +"30105" +"30107" +"30109" +"30111" +"31000" +"31001" +"31003" +"31005" +"31007" 
+"31009" +"31011" +"31013" +"31015" +"31017" +"31019" +"31021" +"31023" +"31025" +"31027" +"31029" +"31031" +"31033" +"31035" +"31037" +"31039" +"31041" +"31043" +"31045" +"31047" +"31049" +"31051" +"31053" +"31055" +"31057" +"31059" +"31061" +"31063" +"31065" +"31067" +"31069" +"31071" +"31073" +"31075" +"31077" +"31079" +"31081" +"31083" +"31085" +"31087" +"31089" +"31091" +"31093" +"31095" +"31097" +"31099" +"31101" +"31103" +"31105" +"31107" +"31109" +"31111" +"31113" +"31115" +"31117" +"31119" +"31121" +"31123" +"31125" +"31127" +"31129" +"31131" +"31133" +"31135" +"31137" +"31139" +"31141" +"31143" +"31145" +"31147" +"31149" +"31151" +"31153" +"31155" +"31157" +"31159" +"31161" +"31163" +"31165" +"31167" +"31169" +"31171" +"31173" +"31175" +"31177" +"31179" +"31181" +"31183" +"31185" +"32000" +"32001" +"32003" +"32005" +"32007" +"32009" +"32011" +"32013" +"32015" +"32017" +"32019" +"32021" +"32023" +"32027" +"32029" +"32031" +"32033" +"32510" +"33000" +"33001" +"33003" +"33005" +"33007" +"33009" +"33011" +"33013" +"33015" +"33017" +"33019" +"34000" +"34001" +"34003" +"34005" +"34007" +"34009" +"34011" +"34013" +"34015" +"34017" +"34019" +"34021" +"34023" +"34025" +"34027" +"34029" +"34031" +"34033" +"34035" +"34037" +"34039" +"34041" +"35000" +"35001" +"35003" +"35005" +"35006" +"35007" +"35009" +"35011" +"35013" +"35015" +"35017" +"35019" +"35021" +"35023" +"35025" +"35027" +"35028" +"35029" +"35031" +"35033" +"35035" +"35037" +"35039" +"35041" +"35043" +"35045" +"35047" +"35049" +"35051" +"35053" +"35055" +"35057" +"35059" +"35061" +"36000" +"36001" +"36003" +"36005" +"36007" +"36009" +"36011" +"36013" +"36015" +"36017" +"36019" +"36021" +"36023" +"36025" +"36027" +"36029" +"36031" +"36033" +"36035" +"36037" +"36039" +"36041" +"36043" +"36045" +"36047" +"36049" +"36051" +"36053" +"36055" +"36057" +"36059" +"36061" +"36063" +"36065" +"36067" +"36069" +"36071" +"36073" +"36075" +"36077" +"36079" +"36081" +"36083" +"36085" +"36087" +"36089" +"36091" +"36093" 
+"36095" +"36097" +"36099" +"36101" +"36103" +"36105" +"36107" +"36109" +"36111" +"36113" +"36115" +"36117" +"36119" +"36121" +"36123" +"37000" +"37001" +"37003" +"37005" +"37007" +"37009" +"37011" +"37013" +"37015" +"37017" +"37019" +"37021" +"37023" +"37025" +"37027" +"37029" +"37031" +"37033" +"37035" +"37037" +"37039" +"37041" +"37043" +"37045" +"37047" +"37049" +"37051" +"37053" +"37055" +"37057" +"37059" +"37061" +"37063" +"37065" +"37067" +"37069" +"37071" +"37073" +"37075" +"37077" +"37079" +"37081" +"37083" +"37085" +"37087" +"37089" +"37091" +"37093" +"37095" +"37097" +"37099" +"37101" +"37103" +"37105" +"37107" +"37109" +"37111" +"37113" +"37115" +"37117" +"37119" +"37121" +"37123" +"37125" +"37127" +"37129" +"37131" +"37133" +"37135" +"37137" +"37139" +"37141" +"37143" +"37145" +"37147" +"37149" +"37151" +"37153" +"37155" +"37157" +"37159" +"37161" +"37163" +"37165" +"37167" +"37169" +"37171" +"37173" +"37175" +"37177" +"37179" +"37181" +"37183" +"37185" +"37187" +"37189" +"37191" +"37193" +"37195" +"37197" +"37199" +"38000" +"38001" +"38003" +"38005" +"38007" +"38009" +"38011" +"38013" +"38015" +"38017" +"38019" +"38021" +"38023" +"38025" +"38027" +"38029" +"38031" +"38033" +"38035" +"38037" +"38039" +"38041" +"38043" +"38045" +"38047" +"38049" +"38051" +"38053" +"38055" +"38057" +"38059" +"38061" +"38063" +"38065" +"38067" +"38069" +"38071" +"38073" +"38075" +"38077" +"38079" +"38081" +"38083" +"38085" +"38087" +"38089" +"38091" +"38093" +"38095" +"38097" +"38099" +"38101" +"38103" +"38105" +"39000" +"39001" +"39003" +"39005" +"39007" +"39009" +"39011" +"39013" +"39015" +"39017" +"39019" +"39021" +"39023" +"39025" +"39027" +"39029" +"39031" +"39033" +"39035" +"39037" +"39039" +"39041" +"39043" +"39045" +"39047" +"39049" +"39051" +"39053" +"39055" +"39057" +"39059" +"39061" +"39063" +"39065" +"39067" +"39069" +"39071" +"39073" +"39075" +"39077" +"39079" +"39081" +"39083" +"39085" +"39087" +"39089" +"39091" +"39093" +"39095" +"39097" +"39099" +"39101" 
+"39103" +"39105" +"39107" +"39109" +"39111" +"39113" +"39115" +"39117" +"39119" +"39121" +"39123" +"39125" +"39127" +"39129" +"39131" +"39133" +"39135" +"39137" +"39139" +"39141" +"39143" +"39145" +"39147" +"39149" +"39151" +"39153" +"39155" +"39157" +"39159" +"39161" +"39163" +"39165" +"39167" +"39169" +"39171" +"39173" +"39175" +"40000" +"40001" +"40003" +"40005" +"40007" +"40009" +"40011" +"40013" +"40015" +"40017" +"40019" +"40021" +"40023" +"40025" +"40027" +"40029" +"40031" +"40033" +"40035" +"40037" +"40039" +"40041" +"40043" +"40045" +"40047" +"40049" +"40051" +"40053" +"40055" +"40057" +"40059" +"40061" +"40063" +"40065" +"40067" +"40069" +"40071" +"40073" +"40075" +"40077" +"40079" +"40081" +"40083" +"40085" +"40087" +"40089" +"40091" +"40093" +"40095" +"40097" +"40099" +"40101" +"40103" +"40105" +"40107" +"40109" +"40111" +"40113" +"40115" +"40117" +"40119" +"40121" +"40123" +"40125" +"40127" +"40129" +"40131" +"40133" +"40135" +"40137" +"40139" +"40141" +"40143" +"40145" +"40147" +"40149" +"40151" +"40153" +"41000" +"41001" +"41003" +"41005" +"41007" +"41009" +"41011" +"41013" +"41015" +"41017" +"41019" +"41021" +"41023" +"41025" +"41027" +"41029" +"41031" +"41033" +"41035" +"41037" +"41039" +"41041" +"41043" +"41045" +"41047" +"41049" +"41051" +"41053" +"41055" +"41057" +"41059" +"41061" +"41063" +"41065" +"41067" +"41069" +"41071" +"42000" +"42001" +"42003" +"42005" +"42007" +"42009" +"42011" +"42013" +"42015" +"42017" +"42019" +"42021" +"42023" +"42025" +"42027" +"42029" +"42031" +"42033" +"42035" +"42037" +"42039" +"42041" +"42043" +"42045" +"42047" +"42049" +"42051" +"42053" +"42055" +"42057" +"42059" +"42061" +"42063" +"42065" +"42067" +"42069" +"42071" +"42073" +"42075" +"42077" +"42079" +"42081" +"42083" +"42085" +"42087" +"42089" +"42091" +"42093" +"42095" +"42097" +"42099" +"42101" +"42103" +"42105" +"42107" +"42109" +"42111" +"42113" +"42115" +"42117" +"42119" +"42121" +"42123" +"42125" +"42127" +"42129" +"42131" +"42133" +"44000" +"44001" 
+"44003" +"44005" +"44007" +"44009" +"45000" +"45001" +"45003" +"45005" +"45007" +"45009" +"45011" +"45013" +"45015" +"45017" +"45019" +"45021" +"45023" +"45025" +"45027" +"45029" +"45031" +"45033" +"45035" +"45037" +"45039" +"45041" +"45043" +"45045" +"45047" +"45049" +"45051" +"45053" +"45055" +"45057" +"45059" +"45061" +"45063" +"45065" +"45067" +"45069" +"45071" +"45073" +"45075" +"45077" +"45079" +"45081" +"45083" +"45085" +"45087" +"45089" +"45091" +"46000" +"46003" +"46005" +"46007" +"46009" +"46011" +"46013" +"46015" +"46017" +"46019" +"46021" +"46023" +"46025" +"46027" +"46029" +"46031" +"46033" +"46035" +"46037" +"46039" +"46041" +"46043" +"46045" +"46047" +"46049" +"46051" +"46053" +"46055" +"46057" +"46059" +"46061" +"46063" +"46065" +"46067" +"46069" +"46071" +"46073" +"46075" +"46077" +"46079" +"46081" +"46083" +"46085" +"46087" +"46089" +"46091" +"46093" +"46095" +"46097" +"46099" +"46101" +"46102" +"46103" +"46105" +"46107" +"46109" +"46111" +"46115" +"46117" +"46119" +"46121" +"46123" +"46125" +"46127" +"46129" +"46135" +"46137" +"47000" +"47001" +"47003" +"47005" +"47007" +"47009" +"47011" +"47013" +"47015" +"47017" +"47019" +"47021" +"47023" +"47025" +"47027" +"47029" +"47031" +"47033" +"47035" +"47037" +"47039" +"47041" +"47043" +"47045" +"47047" +"47049" +"47051" +"47053" +"47055" +"47057" +"47059" +"47061" +"47063" +"47065" +"47067" +"47069" +"47071" +"47073" +"47075" +"47077" +"47079" +"47081" +"47083" +"47085" +"47087" +"47089" +"47091" +"47093" +"47095" +"47097" +"47099" +"47101" +"47103" +"47105" +"47107" +"47109" +"47111" +"47113" +"47115" +"47117" +"47119" +"47121" +"47123" +"47125" +"47127" +"47129" +"47131" +"47133" +"47135" +"47137" +"47139" +"47141" +"47143" +"47145" +"47147" +"47149" +"47151" +"47153" +"47155" +"47157" +"47159" +"47161" +"47163" +"47165" +"47167" +"47169" +"47171" +"47173" +"47175" +"47177" +"47179" +"47181" +"47183" +"47185" +"47187" +"47189" +"48000" +"48001" +"48003" +"48005" +"48007" +"48009" +"48011" +"48013" 
+"48015" +"48017" +"48019" +"48021" +"48023" +"48025" +"48027" +"48029" +"48031" +"48033" +"48035" +"48037" +"48039" +"48041" +"48043" +"48045" +"48047" +"48049" +"48051" +"48053" +"48055" +"48057" +"48059" +"48061" +"48063" +"48065" +"48067" +"48069" +"48071" +"48073" +"48075" +"48077" +"48079" +"48081" +"48083" +"48085" +"48087" +"48089" +"48091" +"48093" +"48095" +"48097" +"48099" +"48101" +"48103" +"48105" +"48107" +"48109" +"48111" +"48113" +"48115" +"48117" +"48119" +"48121" +"48123" +"48125" +"48127" +"48129" +"48131" +"48133" +"48135" +"48137" +"48139" +"48141" +"48143" +"48145" +"48147" +"48149" +"48151" +"48153" +"48155" +"48157" +"48159" +"48161" +"48163" +"48165" +"48167" +"48169" +"48171" +"48173" +"48175" +"48177" +"48179" +"48181" +"48183" +"48185" +"48187" +"48189" +"48191" +"48193" +"48195" +"48197" +"48199" +"48201" +"48203" +"48205" +"48207" +"48209" +"48211" +"48213" +"48215" +"48217" +"48219" +"48221" +"48223" +"48225" +"48227" +"48229" +"48231" +"48233" +"48235" +"48237" +"48239" +"48241" +"48243" +"48245" +"48247" +"48249" +"48251" +"48253" +"48255" +"48257" +"48259" +"48261" +"48263" +"48265" +"48267" +"48269" +"48271" +"48273" +"48275" +"48277" +"48279" +"48281" +"48283" +"48285" +"48287" +"48289" +"48291" +"48293" +"48295" +"48297" +"48299" +"48301" +"48303" +"48305" +"48307" +"48309" +"48311" +"48313" +"48315" +"48317" +"48319" +"48321" +"48323" +"48325" +"48327" +"48329" +"48331" +"48333" +"48335" +"48337" +"48339" +"48341" +"48343" +"48345" +"48347" +"48349" +"48351" +"48353" +"48355" +"48357" +"48359" +"48361" +"48363" +"48365" +"48367" +"48369" +"48371" +"48373" +"48375" +"48377" +"48379" +"48381" +"48383" +"48385" +"48387" +"48389" +"48391" +"48393" +"48395" +"48397" +"48399" +"48401" +"48403" +"48405" +"48407" +"48409" +"48411" +"48413" +"48415" +"48417" +"48419" +"48421" +"48423" +"48425" +"48427" +"48429" +"48431" +"48433" +"48435" +"48437" +"48439" +"48441" +"48443" +"48445" +"48447" +"48449" +"48451" +"48453" +"48455" +"48457" 
+"48459" +"48461" +"48463" +"48465" +"48467" +"48469" +"48471" +"48473" +"48475" +"48477" +"48479" +"48481" +"48483" +"48485" +"48487" +"48489" +"48491" +"48493" +"48495" +"48497" +"48499" +"48501" +"48503" +"48505" +"48507" +"49000" +"49001" +"49003" +"49005" +"49007" +"49009" +"49011" +"49013" +"49015" +"49017" +"49019" +"49021" +"49023" +"49025" +"49027" +"49029" +"49031" +"49033" +"49035" +"49037" +"49039" +"49041" +"49043" +"49045" +"49047" +"49049" +"49051" +"49053" +"49055" +"49057" +"50000" +"50001" +"50003" +"50005" +"50007" +"50009" +"50011" +"50013" +"50015" +"50017" +"50019" +"50021" +"50023" +"50025" +"50027" +"51000" +"51001" +"51003" +"51005" +"51007" +"51009" +"51011" +"51013" +"51015" +"51017" +"51019" +"51021" +"51023" +"51025" +"51027" +"51029" +"51031" +"51033" +"51035" +"51036" +"51037" +"51041" +"51043" +"51045" +"51047" +"51049" +"51051" +"51053" +"51057" +"51059" +"51061" +"51063" +"51065" +"51067" +"51069" +"51071" +"51073" +"51075" +"51077" +"51079" +"51081" +"51083" +"51085" +"51087" +"51089" +"51091" +"51093" +"51095" +"51097" +"51099" +"51101" +"51103" +"51105" +"51107" +"51109" +"51111" +"51113" +"51115" +"51117" +"51119" +"51121" +"51125" +"51127" +"51131" +"51133" +"51135" +"51137" +"51139" +"51141" +"51143" +"51145" +"51147" +"51149" +"51153" +"51155" +"51157" +"51159" +"51161" +"51163" +"51165" +"51167" +"51169" +"51171" +"51173" +"51175" +"51177" +"51179" +"51181" +"51183" +"51185" +"51187" +"51191" +"51193" +"51195" +"51197" +"51199" +"51510" +"51520" +"51530" +"51540" +"51550" +"51570" +"51580" +"51590" +"51595" +"51600" +"51610" +"51620" +"51630" +"51640" +"51650" +"51660" +"51670" +"51678" +"51680" +"51683" +"51685" +"51690" +"51700" +"51710" +"51720" +"51730" +"51735" +"51740" +"51750" +"51760" +"51770" +"51775" +"51790" +"51800" +"51810" +"51820" +"51830" +"51840" +"53000" +"53001" +"53003" +"53005" +"53007" +"53009" +"53011" +"53013" +"53015" +"53017" +"53019" +"53021" +"53023" +"53025" +"53027" +"53029" +"53031" +"53033" 
+"53035" +"53037" +"53039" +"53041" +"53043" +"53045" +"53047" +"53049" +"53051" +"53053" +"53055" +"53057" +"53059" +"53061" +"53063" +"53065" +"53067" +"53069" +"53071" +"53073" +"53075" +"53077" +"54000" +"54001" +"54003" +"54005" +"54007" +"54009" +"54011" +"54013" +"54015" +"54017" +"54019" +"54021" +"54023" +"54025" +"54027" +"54029" +"54031" +"54033" +"54035" +"54037" +"54039" +"54041" +"54043" +"54045" +"54047" +"54049" +"54051" +"54053" +"54055" +"54057" +"54059" +"54061" +"54063" +"54065" +"54067" +"54069" +"54071" +"54073" +"54075" +"54077" +"54079" +"54081" +"54083" +"54085" +"54087" +"54089" +"54091" +"54093" +"54095" +"54097" +"54099" +"54101" +"54103" +"54105" +"54107" +"54109" +"55000" +"55001" +"55003" +"55005" +"55007" +"55009" +"55011" +"55013" +"55015" +"55017" +"55019" +"55021" +"55023" +"55025" +"55027" +"55029" +"55031" +"55033" +"55035" +"55037" +"55039" +"55041" +"55043" +"55045" +"55047" +"55049" +"55051" +"55053" +"55055" +"55057" +"55059" +"55061" +"55063" +"55065" +"55067" +"55069" +"55071" +"55073" +"55075" +"55077" +"55078" +"55079" +"55081" +"55083" +"55085" +"55087" +"55089" +"55091" +"55093" +"55095" +"55097" +"55099" +"55101" +"55103" +"55105" +"55107" +"55109" +"55111" +"55113" +"55115" +"55117" +"55119" +"55121" +"55123" +"55125" +"55127" +"55129" +"55131" +"55133" +"55135" +"55137" +"55139" +"55141" +"56000" +"56001" +"56003" +"56005" +"56007" +"56009" +"56011" +"56013" +"56015" +"56017" +"56019" +"56021" +"56023" +"56025" +"56027" +"56029" +"56031" +"56033" +"56035" +"56037" +"56039" +"56041" +"56043" +"56045" +"72000" diff --git a/validator/csv/dma_geo.csv b/validator/csv/dma_geo.csv new file mode 100644 index 000000000..3315ebd11 --- /dev/null +++ b/validator/csv/dma_geo.csv @@ -0,0 +1,211 @@ +"geo_id" +"500" +"501" +"502" +"503" +"504" +"505" +"506" +"507" +"508" +"509" +"510" +"511" +"512" +"513" +"514" +"515" +"516" +"517" +"518" +"519" +"520" +"521" +"522" +"523" +"524" +"525" +"526" +"527" +"528" +"529" +"530" +"531" 
+"532" +"533" +"534" +"535" +"536" +"537" +"538" +"539" +"540" +"541" +"542" +"543" +"544" +"545" +"546" +"547" +"548" +"549" +"550" +"551" +"552" +"553" +"554" +"555" +"556" +"557" +"558" +"559" +"560" +"561" +"563" +"564" +"565" +"566" +"567" +"569" +"570" +"571" +"573" +"574" +"575" +"576" +"577" +"581" +"582" +"583" +"584" +"588" +"592" +"596" +"597" +"598" +"600" +"602" +"603" +"604" +"605" +"606" +"609" +"610" +"611" +"612" +"613" +"616" +"617" +"618" +"619" +"622" +"623" +"624" +"625" +"626" +"627" +"628" +"630" +"631" +"632" +"633" +"634" +"635" +"636" +"637" +"638" +"639" +"640" +"641" +"642" +"643" +"644" +"647" +"648" +"649" +"650" +"651" +"652" +"656" +"657" +"658" +"659" +"661" +"662" +"669" +"670" +"671" +"673" +"675" +"676" +"678" +"679" +"682" +"686" +"687" +"691" +"692" +"693" +"698" +"702" +"705" +"709" +"710" +"711" +"716" +"717" +"718" +"722" +"724" +"725" +"734" +"736" +"737" +"740" +"743" +"744" +"745" +"746" +"747" +"749" +"751" +"752" +"753" +"754" +"755" +"756" +"757" +"758" +"759" +"760" +"762" +"764" +"765" +"766" +"767" +"770" +"771" +"773" +"789" +"790" +"798" +"800" +"801" +"802" +"803" +"804" +"807" +"810" +"811" +"813" +"819" +"820" +"821" +"825" +"828" +"839" +"855" +"862" +"866" +"868" +"881" diff --git a/validator/csv/hrr_geo.csv b/validator/csv/hrr_geo.csv new file mode 100644 index 000000000..4e9042de5 --- /dev/null +++ b/validator/csv/hrr_geo.csv @@ -0,0 +1,307 @@ +"geo_id" +"1" +"10" +"101" +"102" +"103" +"104" +"105" +"106" +"107" +"109" +"11" +"110" +"111" +"112" +"113" +"115" +"116" +"118" +"119" +"12" +"120" +"122" +"123" +"124" +"127" +"129" +"130" +"131" +"133" +"134" +"137" +"139" +"14" +"140" +"141" +"142" +"144" +"145" +"146" +"147" +"148" +"149" +"15" +"150" +"151" +"152" +"154" +"155" +"156" +"158" +"16" +"161" +"163" +"164" +"166" +"170" +"171" +"172" +"173" +"175" +"179" +"18" +"180" +"181" +"183" +"184" +"185" +"186" +"187" +"188" +"19" +"190" +"191" +"192" +"193" +"194" +"195" +"196" +"197" +"2" +"200" +"201" 
+"203" +"204" +"205" +"207" +"208" +"209" +"21" +"210" +"212" +"213" +"214" +"216" +"217" +"218" +"219" +"22" +"220" +"221" +"222" +"223" +"225" +"226" +"227" +"23" +"230" +"231" +"232" +"233" +"234" +"235" +"236" +"238" +"239" +"240" +"242" +"243" +"244" +"245" +"246" +"248" +"249" +"25" +"250" +"251" +"253" +"254" +"256" +"257" +"258" +"259" +"260" +"261" +"262" +"263" +"264" +"267" +"268" +"270" +"273" +"274" +"275" +"276" +"277" +"278" +"279" +"280" +"281" +"282" +"283" +"284" +"285" +"288" +"289" +"291" +"292" +"293" +"295" +"296" +"297" +"299" +"300" +"301" +"303" +"304" +"307" +"308" +"309" +"31" +"311" +"312" +"313" +"314" +"315" +"318" +"319" +"320" +"321" +"322" +"323" +"324" +"325" +"326" +"327" +"328" +"329" +"33" +"330" +"331" +"332" +"334" +"335" +"336" +"339" +"340" +"341" +"342" +"343" +"344" +"345" +"346" +"347" +"350" +"351" +"352" +"354" +"355" +"356" +"357" +"358" +"359" +"360" +"362" +"363" +"364" +"365" +"366" +"367" +"368" +"369" +"370" +"371" +"373" +"374" +"375" +"376" +"377" +"379" +"380" +"382" +"383" +"385" +"386" +"388" +"390" +"391" +"393" +"394" +"396" +"397" +"399" +"400" +"402" +"406" +"411" +"412" +"413" +"416" +"417" +"418" +"420" +"421" +"422" +"423" +"424" +"426" +"427" +"428" +"429" +"43" +"430" +"431" +"432" +"435" +"437" +"438" +"439" +"440" +"441" +"442" +"443" +"444" +"445" +"446" +"447" +"448" +"449" +"450" +"451" +"452" +"456" +"457" +"5" +"56" +"58" +"6" +"62" +"65" +"69" +"7" +"73" +"77" +"78" +"79" +"80" +"81" +"82" +"83" +"85" +"86" +"87" +"89" +"9" +"91" +"96" diff --git a/validator/csv/msa_geo.csv b/validator/csv/msa_geo.csv new file mode 100644 index 000000000..9025de71a --- /dev/null +++ b/validator/csv/msa_geo.csv @@ -0,0 +1,382 @@ +"geo_id" +"10180" +"10420" +"10500" +"10540" +"10580" +"10740" +"10780" +"10900" +"11020" +"11100" +"11180" +"11260" +"11460" +"11500" +"11540" +"11700" +"12020" +"12060" +"12100" +"12220" +"12260" +"12420" +"12540" +"12580" +"12620" +"12700" +"12940" +"12980" +"13020" +"13140" 
+"13220" +"13380" +"13460" +"13740" +"13780" +"13820" +"13900" +"13980" +"14010" +"14020" +"14100" +"14260" +"14460" +"14500" +"14540" +"14740" +"14860" +"15180" +"15260" +"15380" +"15500" +"15540" +"15680" +"15940" +"15980" +"16020" +"16060" +"16180" +"16220" +"16300" +"16540" +"16580" +"16620" +"16700" +"16740" +"16820" +"16860" +"16940" +"16980" +"17020" +"17140" +"17300" +"17420" +"17460" +"17660" +"17780" +"17820" +"17860" +"17900" +"17980" +"18020" +"18140" +"18580" +"18700" +"18880" +"19060" +"19100" +"19140" +"19180" +"19300" +"19340" +"19430" +"19460" +"19500" +"19660" +"19740" +"19780" +"19820" +"20020" +"20100" +"20220" +"20260" +"20500" +"20700" +"20740" +"20940" +"21060" +"21140" +"21300" +"21340" +"21420" +"21500" +"21660" +"21780" +"21820" +"22020" +"22140" +"22180" +"22220" +"22380" +"22420" +"22500" +"22520" +"22540" +"22660" +"22900" +"23060" +"23420" +"23460" +"23540" +"23580" +"23900" +"24020" +"24140" +"24220" +"24260" +"24300" +"24340" +"24420" +"24500" +"24540" +"24580" +"24660" +"24780" +"24860" +"25060" +"25180" +"25220" +"25260" +"25420" +"25540" +"25620" +"25860" +"25940" +"25980" +"26140" +"26300" +"26380" +"26420" +"26580" +"26620" +"26820" +"26900" +"26980" +"27060" +"27100" +"27140" +"27180" +"27260" +"27340" +"27500" +"27620" +"27740" +"27780" +"27860" +"27900" +"28020" +"28100" +"28140" +"28420" +"28660" +"28700" +"28740" +"28940" +"29020" +"29100" +"29180" +"29200" +"29340" +"29420" +"29460" +"29540" +"29620" +"29700" +"29740" +"29820" +"29940" +"30020" +"30140" +"30300" +"30340" +"30460" +"30620" +"30700" +"30780" +"30860" +"30980" +"31020" +"31080" +"31140" +"31180" +"31340" +"31420" +"31460" +"31540" +"31700" +"31740" +"31860" +"31900" +"32580" +"32780" +"32820" +"32900" +"33100" +"33140" +"33220" +"33260" +"33340" +"33460" +"33540" +"33660" +"33700" +"33740" +"33780" +"33860" +"34060" +"34100" +"34580" +"34620" +"34740" +"34820" +"34900" +"34940" +"34980" +"35100" +"35300" +"35380" +"35620" +"35660" +"35840" +"35980" +"36100" 
+"36140" +"36220" +"36260" +"36420" +"36500" +"36540" +"36740" +"36780" +"36980" +"37100" +"37340" +"37460" +"37620" +"37860" +"37900" +"37980" +"38060" +"38220" +"38300" +"38340" +"38540" +"38860" +"38900" +"38940" +"39100" +"39150" +"39300" +"39340" +"39380" +"39460" +"39540" +"39580" +"39660" +"39740" +"39820" +"39900" +"40060" +"40140" +"40220" +"40340" +"40380" +"40420" +"40580" +"40660" +"40900" +"40980" +"41060" +"41100" +"41140" +"41180" +"41420" +"41500" +"41540" +"41620" +"41660" +"41700" +"41740" +"41860" +"41940" +"42020" +"42100" +"42140" +"42200" +"42220" +"42340" +"42540" +"42660" +"42680" +"42700" +"43100" +"43300" +"43340" +"43420" +"43580" +"43620" +"43780" +"43900" +"44060" +"44100" +"44140" +"44180" +"44220" +"44300" +"44700" +"44940" +"45060" +"45220" +"45300" +"45460" +"45500" +"45540" +"45780" +"45820" +"45940" +"46060" +"46140" +"46220" +"46300" +"46340" +"46520" +"46540" +"46660" +"46700" +"47020" +"47220" +"47260" +"47300" +"47380" +"47460" +"47580" +"47900" +"47940" +"48060" +"48140" +"48260" +"48300" +"48540" +"48620" +"48660" +"48700" +"48900" +"49020" +"49180" +"49340" +"49420" +"49620" +"49660" +"49700" +"49740" diff --git a/validator/csv/state_geo.csv b/validator/csv/state_geo.csv new file mode 100644 index 000000000..e4d129ad6 --- /dev/null +++ b/validator/csv/state_geo.csv @@ -0,0 +1,53 @@ +"geo_id" +"ak" +"al" +"ar" +"az" +"ca" +"co" +"ct" +"dc" +"de" +"fl" +"ga" +"hi" +"ia" +"id" +"il" +"in" +"ks" +"ky" +"la" +"ma" +"md" +"me" +"mi" +"mn" +"mo" +"ms" +"mt" +"nc" +"nd" +"ne" +"nh" +"nj" +"nm" +"nv" +"ny" +"oh" +"ok" +"or" +"pa" +"pr" +"ri" +"sc" +"sd" +"tn" +"tx" +"ut" +"va" +"vt" +"wa" +"wi" +"wv" +"wy" diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index a56cfe4c4..99f1ec107 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -244,6 +244,23 @@ def check_df_format(self, df_to_test, nameformat): self.increment_total_checks() + def 
check_bad_geo_value(self, geo_type, df_to_test): + """ + Check for unknown geo_ids, by comparing to historic data + + Arguments: + - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data + - df_to_test: pandas dataframe of CSV source data containing the geo_ids to check + """ + file_path = join(r'../validator/csv', geo_type + '_geo.csv') + valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) + valid_geos = valid_geo_df['geo_value'].values + unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo not in valid_geos] + if len(unexpected_geos) > 0: + self.raised_errors.append(ValidationError( + ("check_bad_geo_value", geo_type), + unexpected_geos, "invalid geo_ids found")) + def check_bad_geo_id(self, df_to_test, nameformat, geo_type): """ Check validity of geo type and values, according to regex pattern. @@ -722,6 +739,7 @@ def validate(self, export_dir): self.check_df_format(data_df, filename) self.check_bad_geo_id( data_df, filename, match.groupdict()['geo_type']) + self.check_bad_geo_value(match.groupdict()['geo_type'], data_df) self.check_bad_val(data_df, filename, match.groupdict()['signal']) self.check_bad_se(data_df, filename) self.check_bad_sample_size(data_df, filename) @@ -941,4 +959,4 @@ def exit(self): if len(subset_raised_errors) != 0: sys.exit(1) else: - sys.exit(0) + sys.exit(0) \ No newline at end of file diff --git a/validator/dev/unique_geoids.R b/validator/dev/unique_geoids.R new file mode 100644 index 000000000..851936b58 --- /dev/null +++ b/validator/dev/unique_geoids.R @@ -0,0 +1,12 @@ +library(covidcast) + +geo_types = c("county", "state", "hrr", "msa") +for(type in geo_types){ + dtf = covidcast_signal("indicator-combination", "confirmed_7dav_incidence_num", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = type) + file_name = paste0("csv/", type, "_geo.csv") + write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") +} + +dtf = covidcast_signal("ght", 
"raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma") +file_name = paste0("csv/dma_geo.csv") +write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") \ No newline at end of file From f2562e6b13e8c45dd500813e2e957f69b7eee792 Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Mon, 2 Nov 2020 17:28:28 -0500 Subject: [PATCH 132/151] Fix typo (geo_value -> geo_id) --- validator/delphi_validator/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 99f1ec107..2a50978b1 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -254,7 +254,7 @@ def check_bad_geo_value(self, geo_type, df_to_test): """ file_path = join(r'../validator/csv', geo_type + '_geo.csv') valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) - valid_geos = valid_geo_df['geo_value'].values + valid_geos = valid_geo_df['geo_id'].values unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo not in valid_geos] if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( From ac28c334953eed2678383e738b26f2f1fd4a73d3 Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Tue, 3 Nov 2020 13:31:44 -0500 Subject: [PATCH 133/151] Minor fixes - Clarify format vs. 
value checks - move files from csv/ -> static/ --- validator/delphi_validator/validate.py | 20 ++++++++++---------- validator/dev/unique_geoids.R | 4 ++-- validator/{csv => static}/county_geo.csv | 0 validator/{csv => static}/dma_geo.csv | 0 validator/{csv => static}/hrr_geo.csv | 0 validator/{csv => static}/msa_geo.csv | 0 validator/{csv => static}/state_geo.csv | 0 7 files changed, 12 insertions(+), 12 deletions(-) rename validator/{csv => static}/county_geo.csv (100%) rename validator/{csv => static}/dma_geo.csv (100%) rename validator/{csv => static}/hrr_geo.csv (100%) rename validator/{csv => static}/msa_geo.csv (100%) rename validator/{csv => static}/state_geo.csv (100%) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 2a50978b1..6e1bbc6c7 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -244,26 +244,26 @@ def check_df_format(self, df_to_test, nameformat): self.increment_total_checks() - def check_bad_geo_value(self, geo_type, df_to_test): + def check_bad_geo_id_value(self, df_to_test, geo_type): """ - Check for unknown geo_ids, by comparing to historic data + Check for bad geo_id values, by comparing to a list of known values (drawn from historical data) Arguments: + - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) 
of data - - df_to_test: pandas dataframe of CSV source data containing the geo_ids to check """ - file_path = join(r'../validator/csv', geo_type + '_geo.csv') + file_path = join(r'../validator/static', geo_type + '_geo.csv') valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) valid_geos = valid_geo_df['geo_id'].values unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo not in valid_geos] if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( - ("check_bad_geo_value", geo_type), - unexpected_geos, "invalid geo_ids found")) + ("check_bad_geo_id_value", filename), + unexpected_geos, "Unrecognized geo_ids (not in historical data)")) - def check_bad_geo_id(self, df_to_test, nameformat, geo_type): + def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type): """ - Check validity of geo type and values, according to regex pattern. + Check validity of geo_type and format of geo_ids, according to regex pattern. Arguments: - df_to_test: pandas dataframe of CSV source data @@ -737,9 +737,9 @@ def validate(self, export_dir): data_df = load_csv(join(export_dir, filename)) self.check_df_format(data_df, filename) - self.check_bad_geo_id( + self.check_bad_geo_id_format( data_df, filename, match.groupdict()['geo_type']) - self.check_bad_geo_value(match.groupdict()['geo_type'], data_df) + self.check_bad_geo_id_value(data_df, match.groupdict()['geo_type']) self.check_bad_val(data_df, filename, match.groupdict()['signal']) self.check_bad_se(data_df, filename) self.check_bad_sample_size(data_df, filename) diff --git a/validator/dev/unique_geoids.R b/validator/dev/unique_geoids.R index 851936b58..c78f87d28 100644 --- a/validator/dev/unique_geoids.R +++ b/validator/dev/unique_geoids.R @@ -3,10 +3,10 @@ library(covidcast) geo_types = c("county", "state", "hrr", "msa") for(type in geo_types){ dtf = covidcast_signal("indicator-combination", "confirmed_7dav_incidence_num", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = type) - 
file_name = paste0("csv/", type, "_geo.csv") + file_name = paste0("static/", type, "_geo.csv") write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") } dtf = covidcast_signal("ght", "raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma") -file_name = paste0("csv/dma_geo.csv") +file_name = paste0("static/dma_geo.csv") write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") \ No newline at end of file diff --git a/validator/csv/county_geo.csv b/validator/static/county_geo.csv similarity index 100% rename from validator/csv/county_geo.csv rename to validator/static/county_geo.csv diff --git a/validator/csv/dma_geo.csv b/validator/static/dma_geo.csv similarity index 100% rename from validator/csv/dma_geo.csv rename to validator/static/dma_geo.csv diff --git a/validator/csv/hrr_geo.csv b/validator/static/hrr_geo.csv similarity index 100% rename from validator/csv/hrr_geo.csv rename to validator/static/hrr_geo.csv diff --git a/validator/csv/msa_geo.csv b/validator/static/msa_geo.csv similarity index 100% rename from validator/csv/msa_geo.csv rename to validator/static/msa_geo.csv diff --git a/validator/csv/state_geo.csv b/validator/static/state_geo.csv similarity index 100% rename from validator/csv/state_geo.csv rename to validator/static/state_geo.csv From 0ea0a79896272a87c9f0bcb82ee76cf4eb989d7d Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Tue, 3 Nov 2020 14:13:59 -0500 Subject: [PATCH 134/151] Allow setting static directory in template --- validator/delphi_validator/validate.py | 3 ++- validator/params.json.template | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 6e1bbc6c7..d616dafd0 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -113,6 +113,7 @@ def __init__(self, params): """ # Get user settings from params or if 
not provided, set default. self.data_source = params['data_source'] + self.validator_static_file_dir = params.get('validator_static_file_dir', '../validator/static') # Date/time settings self.span_length = timedelta(days=params['span_length']) @@ -252,7 +253,7 @@ def check_bad_geo_id_value(self, df_to_test, geo_type): - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data """ - file_path = join(r'../validator/static', geo_type + '_geo.csv') + file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv') valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) valid_geos = valid_geo_df['geo_id'].values unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo not in valid_geos] diff --git a/validator/params.json.template b/validator/params.json.template index 2b71c0f89..643dc7838 100644 --- a/validator/params.json.template +++ b/validator/params.json.template @@ -4,6 +4,7 @@ "end_date": "2020-09-08", "span_length": 3, "ref_window_size": 7, + "validator_static_file_dir": "../validator/static", "minimum_sample_size": 100, "missing_se_allowed": true, "missing_sample_size_allowed": true, From 69881a4128abbb3f58727449f8e9002cc57747cd Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Tue, 3 Nov 2020 14:29:35 -0500 Subject: [PATCH 135/151] rename directory (dev -> scripts) Renamed directory and file (unique_geoids.R) is now expected to be run from within the directory instead of from one level up. 
--- validator/{dev => scripts}/unique_geoids.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename validator/{dev => scripts}/unique_geoids.R (84%) diff --git a/validator/dev/unique_geoids.R b/validator/scripts/unique_geoids.R similarity index 84% rename from validator/dev/unique_geoids.R rename to validator/scripts/unique_geoids.R index c78f87d28..fb4044a56 100644 --- a/validator/dev/unique_geoids.R +++ b/validator/scripts/unique_geoids.R @@ -3,10 +3,10 @@ library(covidcast) geo_types = c("county", "state", "hrr", "msa") for(type in geo_types){ dtf = covidcast_signal("indicator-combination", "confirmed_7dav_incidence_num", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = type) - file_name = paste0("static/", type, "_geo.csv") + file_name = paste0("../static/", type, "_geo.csv") write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") } dtf = covidcast_signal("ght", "raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma") -file_name = paste0("static/dma_geo.csv") +file_name = paste0("../static/dma_geo.csv") write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") \ No newline at end of file From 9ac11d8d2fc16a1e9e10e567c074753f1f9d366d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 3 Nov 2020 18:55:46 -0500 Subject: [PATCH 136/151] update unittests with function name changes --- validator/tests/test_checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index c8e199f2d..a61c1764a 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -82,7 +82,7 @@ def test_empty_filelist(self): validator = Validator(params) filenames = list() - validator.check_missing_dates(filenames) + validator.check_missing_date_files(filenames) assert len(validator.raised_errors) == 1 assert 
"check_missing_date_files" in [ @@ -95,7 +95,7 @@ def test_same_day(self): validator = Validator(params) filenames = [("20200901_county_signal_signal.csv", "match_obj")] - validator.check_missing_dates(filenames) + validator.check_missing_date_files(filenames) assert len(validator.raised_errors) == 0 assert "check_missing_date_files" not in [ @@ -110,7 +110,7 @@ def test_duplicate_dates(self): ("20200903_county_signal_signal.csv", "match_obj"), ("20200903_usa_signal_signal.csv", "match_obj"), ("20200903_usa_signal_signal.csv", "match_obj")] - validator.check_missing_dates(filenames) + validator.check_missing_date_files(filenames) assert len(validator.raised_errors) == 1 assert "check_missing_date_files" in [ From 4bbe7877d26987deb2859c7f10cf73af9d0a5464 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 3 Nov 2020 19:14:33 -0500 Subject: [PATCH 137/151] update plans --- validator/PLANS.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 84ab0de0f..c0df0412e 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -6,6 +6,7 @@ * Recognized file name format * Recognized geographical type (county, state, etc) * Recognized geo id format (e.g. 
state is two lowercase letters) +* Geo id has been seen before, in historical data * Missing geo type + signal + date combos based on the geo type + signal combos Covidcast metadata says should be available * Missing ‘val’ values * Negative ‘val’ values @@ -21,6 +22,8 @@ * Most recent date seen in source data is not older than most recent date seen in reference data * Similar number of obs per day as recent API data (static threshold) * Similar average value as API data (static threshold) +* Source data for specified date range is empty +* API data for specified date range is empty ## Current features @@ -33,15 +36,14 @@ ## Checks + features wishlist, and problems to think about: -* Improve performance and reduce runtime (what's the goal?) +* Improve performance and reduce runtime (what's the target time?) * Profiling (iterate) - * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section) - * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance. -* Which, if any, *specific* geo_ids are missing (get unique geo ids from historical data or delphi_utils) + * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section. Parallelize?) + * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance or may just make access more readable. * Check for duplicate rows * Check explicitly for large spikes (avg_val check can detect jumps in average value) * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. 
If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. -* Run check_missing_dates on every geo type-signal type separately. Probably move check to geo_sig loop. +* Run check_missing_date_files (or similar) on every geo type-signal type separately in comparative checks loop. * Different test thresholds for different files? Currently some control based on smoothed vs raw signals * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamini-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) * Nicer formatting for error “report”. - * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each + * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? 
But it still has to be clear how to suppress each individually * Easier suppression of many errors at once -* Ensure validator runs on signals that require AWS credentials (iterate) * Use known erroneous/anomalous days of source data to tune static thresholds and test behavior +* Ensure validator runs on signals that require AWS credentials (iterate) * Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive -* If can't get data from API, do we want to use substitute data for the comparative checks instead? E.g. most recent successful API pull -- might end up being a couple weeks older +* If can't get data from API, do we want to use substitute data for the comparative checks instead? + * E.g. most recent successful API pull -- might end up being a couple weeks older * Currently, any API fetch problems just doesn't do comparative checks at all. -* Potentially implement a check for erratic data sources that wrongly report all 0's (like the error with the Wisconsin data for the 10/26 forecasts) +* Check for erratic data sources that wrongly report all zeroes + * E.g. the error with the Wisconsin data for the 10/26 forecasts + * Wary of a purely static check for this + * Are there any geo regions where this might cause false positives? E.g. small counties or MSAs, certain signals (deaths, since it's << cases) + * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week + * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. 
From e29e8a74141d97d070252bc5e57b898d29eaed1a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 3 Nov 2020 19:17:46 -0500 Subject: [PATCH 138/151] update plans --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index c0df0412e..966904463 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -36,7 +36,7 @@ ## Checks + features wishlist, and problems to think about: -* Improve performance and reduce runtime (what's the target time?) +* Improve performance and reduce runtime (what's the target time? Just want to not be painfully slow...) * Profiling (iterate) * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section. Parallelize?) * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance or may just make access more readable. 
From 06ba36c97ee861750f1c51c1874bf144c5f7bc97 Mon Sep 17 00:00:00 2001 From: Jed Grabman Date: Wed, 4 Nov 2020 14:52:39 -0500 Subject: [PATCH 139/151] Pass through filename for error message Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- validator/delphi_validator/validate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index d616dafd0..299cf48be 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -245,7 +245,7 @@ def check_df_format(self, df_to_test, nameformat): self.increment_total_checks() - def check_bad_geo_id_value(self, df_to_test, geo_type): + def check_bad_geo_id_value(self, df_to_test, filename, geo_type): """ Check for bad geo_id values, by comparing to a list of known values (drawn from historical data) @@ -740,7 +740,7 @@ def validate(self, export_dir): self.check_df_format(data_df, filename) self.check_bad_geo_id_format( data_df, filename, match.groupdict()['geo_type']) - self.check_bad_geo_id_value(data_df, match.groupdict()['geo_type']) + self.check_bad_geo_id_value(data_df, filename, match.groupdict()['geo_type']) self.check_bad_val(data_df, filename, match.groupdict()['signal']) self.check_bad_se(data_df, filename) self.check_bad_sample_size(data_df, filename) @@ -960,4 +960,4 @@ def exit(self): if len(subset_raised_errors) != 0: sys.exit(1) else: - sys.exit(0) \ No newline at end of file + sys.exit(0) From 373af740c3fe19b4ea4e160e6778a982264cd40d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 4 Nov 2020 17:05:59 -0500 Subject: [PATCH 140/151] update plans. 
Create starter-issue section --- validator/PLANS.md | 50 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 966904463..941561d7a 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -34,17 +34,38 @@ * User can manually disable specific checks for specific datasets using a field in the params.json file * User can enable test mode (checks only a small number of data files) using a field in the params.json file -## Checks + features wishlist, and problems to think about: +## Checks + features wishlist, and problems to think about + +### Starter/small issues -* Improve performance and reduce runtime (what's the target time? Just want to not be painfully slow...) - * Profiling (iterate) - * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section. Parallelize?) - * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance or may just make access more readable. * Check for duplicate rows -* Check explicitly for large spikes (avg_val check can detect jumps in average value) * Backfill problems, especially with JHU and USA Facts, where a change to old data results in a datapoint that doesn’t agree with surrounding data ([JHU examples](https://delphi-org.slack.com/archives/CF9G83ZJ9/p1600729151013900)) or is very different from the value it replaced. If date is already in the API, have any values changed significantly within the "backfill" window (use span_length setting). See [this](https://github.com/cmu-delphi/covidcast-indicators/pull/155#discussion_r504195207) for context. * Run check_missing_date_files (or similar) on every geo type-signal type separately in comparative checks loop. -* Different test thresholds for different files? 
Currently some control based on smoothed vs raw signals + +### Larger issues + +* Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive +* Check for erratic data sources that wrongly report all zeroes + * E.g. the error with the Wisconsin data for the 10/26 forecasts + * Wary of a purely static check for this + * Are there any geo regions where this might cause false positives? E.g. small counties or MSAs, certain signals (deaths, since it's << cases) + * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week + * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. +* Easier suppression of many errors at once +* Nicer formatting for error “report”. + * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each individually +* Use known erroneous/anomalous days of source data to tune static thresholds and test behavior +* If can't get data from API, do we want to use substitute data for the comparative checks instead? + * E.g. most recent successful API pull -- might end up being a couple weeks older + * Currently, any API fetch problems just doesn't do comparative checks at all. +* Improve performance and reduce runtime (no particular goal, just avoid being painfully slow!) + * Profiling (iterate) + * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section. Parallelize?) + * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance or may just make access more readable. 
+* Ensure validator runs on signals that require AWS credentials (iterate) + +### Longer-term issues + * Data correctness and consistency over longer time periods (weeks to months). Compare data against long-ago (3 months?) API data for changes in trends. * Long-term trends and correlations between time series. Currently, checks only look at a data window of a few days * Any relevant anomaly detection packages already exist? @@ -57,18 +78,3 @@ * Raise errors when one p-value (per geo region, e.g.) is significant OR when a bunch of p-values for that same type of test (different geo regions, e.g.) are "close" to significant * Correct p-values for multiple testing * Bonferroni would be easy but is sensitive to choice of "family" of tests; Benjamimi-Hochberg is a bit more involved but is less sensitive to choice of "family"; [comparison of the two](https://delphi-org.slack.com/archives/D01A9KNTPKL/p1603294915000500) -* Nicer formatting for error “report”. - * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each individually -* Easier suppression of many errors at once -* Use known erroneous/anomalous days of source data to tune static thresholds and test behavior -* Ensure validator runs on signals that require AWS credentials (iterate) -* Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive -* If can't get data from API, do we want to use substitute data for the comparative checks instead? - * E.g. most recent successful API pull -- might end up being a couple weeks older - * Currently, any API fetch problems just doesn't do comparative checks at all. -* Check for erratic data sources that wrongly report all zeroes - * E.g. 
the error with the Wisconsin data for the 10/26 forecasts - * Wary of a purely static check for this - * Are there any geo regions where this might cause false positives? E.g. small counties or MSAs, certain signals (deaths, since it's << cases) - * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week - * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. From 61b0b99da919784d7d708a1c635b8217d44b1a9f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 4 Nov 2020 17:09:07 -0500 Subject: [PATCH 141/151] update plans --- validator/PLANS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/validator/PLANS.md b/validator/PLANS.md index 941561d7a..0dd9541d3 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -52,6 +52,7 @@ * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. * Easier suppression of many errors at once + * Maybe store errors as dict of dicts. Keys could be check strings (e.g. "check_bad_se"), then next layer geo type, etc * Nicer formatting for error “report”. * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? 
But it still has to be clear how to suppress each individually * Use known erroneous/anomalous days of source data to tune static thresholds and test behavior From 82feea88b52f29ef2100d283f2c5a531d338a196 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 5 Nov 2020 10:23:52 -0500 Subject: [PATCH 142/151] update plans --- validator/PLANS.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 0dd9541d3..19e5ee72d 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -44,17 +44,18 @@ ### Larger issues -* Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive +* Improve errors and error reports + * Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive + * Easier suppression of many errors at once + * Maybe store errors as dict of dicts. Keys could be check strings (e.g. "check_bad_se"), then next layer geo type, etc + * Nicer formatting for error “report”. + * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each individually * Check for erratic data sources that wrongly report all zeroes * E.g. the error with the Wisconsin data for the 10/26 forecasts * Wary of a purely static check for this * Are there any geo regions where this might cause false positives? E.g. 
small counties or MSAs, certain signals (deaths, since it's << cases) * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. -* Easier suppression of many errors at once - * Maybe store errors as dict of dicts. Keys could be check strings (e.g. "check_bad_se"), then next layer geo type, etc -* Nicer formatting for error “report”. - * E.g. if a single type of error is raised for many different datasets, summarize all error messages into a single message? But it still has to be clear how to suppress each individually * Use known erroneous/anomalous days of source data to tune static thresholds and test behavior * If can't get data from API, do we want to use substitute data for the comparative checks instead? * E.g. most recent successful API pull -- might end up being a couple weeks older From 8984aac1410311c662972c35cb873be9d2a25a43 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 5 Nov 2020 10:29:03 -0500 Subject: [PATCH 143/151] update plans --- validator/PLANS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 19e5ee72d..f47393414 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -44,7 +44,7 @@ ### Larger issues -* Improve errors and error reports +* Improve errors and error report * Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive * Easier suppression of many errors at once * Maybe store errors as dict of dicts. Keys could be check strings (e.g. 
"check_bad_se"), then next layer geo type, etc From 538d98ee65116b27f93605acdd8194854b009313 Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Thu, 5 Nov 2020 12:27:14 -0500 Subject: [PATCH 144/151] Add file for national values --- validator/scripts/unique_geoids.R | 7 +++++-- validator/static/national_geo.csv | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 validator/static/national_geo.csv diff --git a/validator/scripts/unique_geoids.R b/validator/scripts/unique_geoids.R index fb4044a56..52edec904 100644 --- a/validator/scripts/unique_geoids.R +++ b/validator/scripts/unique_geoids.R @@ -8,5 +8,8 @@ for(type in geo_types){ } dtf = covidcast_signal("ght", "raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma") -file_name = paste0("../static/dma_geo.csv") -write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") \ No newline at end of file +file_name = "../static/dma_geo.csv" +write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") + +national_file = "../static/national_geo.csv" +write.table("us", file = national_file, row.names = F, col.names = "geo_id") diff --git a/validator/static/national_geo.csv b/validator/static/national_geo.csv new file mode 100644 index 000000000..f445fd82d --- /dev/null +++ b/validator/static/national_geo.csv @@ -0,0 +1,2 @@ +"geo_id" +"us" From 45046cbc7c4e46ecb7abcc2d09c9b48a519bd5cf Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Thu, 5 Nov 2020 12:54:57 -0500 Subject: [PATCH 145/151] Allow uppercase version of geo_id, but with warning --- validator/delphi_validator/validate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 299cf48be..ea7350b82 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -256,11 +256,18 @@ def check_bad_geo_id_value(self, df_to_test, 
filename, geo_type): file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv') valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) valid_geos = valid_geo_df['geo_id'].values - unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo not in valid_geos] + unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() not in valid_geos] if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( ("check_bad_geo_id_value", filename), unexpected_geos, "Unrecognized geo_ids (not in historical data)")) + self.increment_total_checks() + upper_case_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() != geo] + if len(upper_case_geos) > 0: + self.raised_warnings.append(ValidationError( + ("check_geo_id_lowercase", filename), + upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred.")) + self.increment_total_checks() def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type): """ From ad32bd3b156c153261f3f26a11d91e65518caf88 Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Thu, 5 Nov 2020 12:55:16 -0500 Subject: [PATCH 146/151] Add tests for geo_id value checks --- validator/tests/test_checks.py | 97 +++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 8 deletions(-) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index c8e199f2d..fd7250a62 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -146,21 +146,21 @@ def test_expected_groups(self): assert pattern_found["signal"] == "signal_signal" -class TestCheckBadGeoId: +class TestCheckBadGeoIdFormat: params = {"data_source": "", "span_length": 0, "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) - validator.check_bad_geo_id(empty_df, "name", "county") + validator.check_bad_geo_id_format(empty_df, "name", "county") assert len(validator.raised_errors) == 0 def 
test_invalid_geo_type(self): validator = Validator(self.params) empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) - validator.check_bad_geo_id(empty_df, "name", "hello") + validator.check_bad_geo_id_format(empty_df, "name", "hello") assert len(validator.raised_errors) == 1 assert "check_geo_type" in [ @@ -173,7 +173,7 @@ def test_invalid_geo_id_county(self): validator = Validator(self.params) df = pd.DataFrame(["0", "54321", "123", ".0000", "abc12"], columns=["geo_id"]) - validator.check_bad_geo_id(df, "name", "county") + validator.check_bad_geo_id_format(df, "name", "county") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id @@ -184,7 +184,7 @@ def test_invalid_geo_id_msa(self): validator = Validator(self.params) df = pd.DataFrame(["0", "54321", "123", ".0000", "abc12"], columns=["geo_id"]) - validator.check_bad_geo_id(df, "name", "msa") + validator.check_bad_geo_id_format(df, "name", "msa") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id @@ -195,7 +195,7 @@ def test_invalid_geo_id_hrr(self): validator = Validator(self.params) df = pd.DataFrame(["1", "12", "123", "1234", "12345", "a", ".", "ab1"], columns=["geo_id"]) - validator.check_bad_geo_id(df, "name", "hrr") + validator.check_bad_geo_id_format(df, "name", "hrr") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id @@ -208,7 +208,7 @@ def test_invalid_geo_id_state(self): validator = Validator(self.params) df = pd.DataFrame(["aa", "hi", "HI", "hawaii", "Hawaii", "a", "H.I."], columns=["geo_id"]) - validator.check_bad_geo_id(df, "name", "state") + validator.check_bad_geo_id_format(df, "name", "state") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id @@ -221,7 +221,7 @@ def test_invalid_geo_id_national(self): validator = Validator(self.params) df = 
pd.DataFrame(["usa", "SP", " us", "us", "usausa", "US"], columns=["geo_id"]) - validator.check_bad_geo_id(df, "name", "national") + validator.check_bad_geo_id_format(df, "name", "national") assert len(validator.raised_errors) == 1 assert "check_geo_id_format" in validator.raised_errors[0].check_data_id @@ -230,6 +230,87 @@ def test_invalid_geo_id_national(self): assert "US" not in validator.raised_errors[0].expression assert "SP" not in validator.raised_errors[0].expression +class TestCheckBadGeoIdValue: + params = {"data_source": "", "span_length": 0, + "end_date": "2020-09-02", "expected_lag": {}} + + def test_empty_df(self): + validator = Validator(self.params) + empty_df = pd.DataFrame(columns=["geo_id"], dtype=str) + validator.check_bad_geo_id_value(empty_df, "name", "county") + assert len(validator.raised_errors) == 0 + + def test_invalid_geo_id_county(self): + validator = Validator(self.params) + df = pd.DataFrame(["01001", "88888", "99999"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "county") + + assert len(validator.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 2 + assert "01001" not in validator.raised_errors[0].expression + assert "88888" in validator.raised_errors[0].expression + assert "99999" in validator.raised_errors[0].expression + + def test_invalid_geo_id_msa(self): + validator = Validator(self.params) + df = pd.DataFrame(["10180", "88888", "99999"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "msa") + + assert len(validator.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 2 + assert "10180" not in validator.raised_errors[0].expression + assert "88888" in validator.raised_errors[0].expression + assert "99999" in validator.raised_errors[0].expression + + def test_invalid_geo_id_hrr(self): + 
validator = Validator(self.params) + df = pd.DataFrame(["1", "11", "111", "8", "88", "888"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "hrr") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 3 + assert "1" not in validator.raised_errors[0].expression + assert "11" not in validator.raised_errors[0].expression + assert "111" not in validator.raised_errors[0].expression + assert "8" in validator.raised_errors[0].expression + assert "88" in validator.raised_errors[0].expression + assert "888" in validator.raised_errors[0].expression + + def test_invalid_geo_id_state(self): + validator = Validator(self.params) + df = pd.DataFrame(["aa", "ak"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "state") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 1 + assert "ak" not in validator.raised_errors[0].expression + assert "aa" in validator.raised_errors[0].expression + + def test_uppercase_geo_id(self): + validator = Validator(self.params) + df = pd.DataFrame(["ak", "AK"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "state") + + assert len(validator.raised_errors) == 0 + assert len(validator.raised_warnings) == 1 + assert "check_geo_id_lowercase" in validator.raised_warnings[0].check_data_id + assert "AK" in validator.raised_warnings[0].expression + + def test_invalid_geo_id_national(self): + validator = Validator(self.params) + df = pd.DataFrame(["us", "zz"], columns=["geo_id"]) + validator.check_bad_geo_id_value(df, "name", "national") + + assert len(validator.raised_errors) == 1 + assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert len(validator.raised_errors[0].expression) == 1 + assert "us" not in 
validator.raised_errors[0].expression + assert "zz" in validator.raised_errors[0].expression + class TestCheckBadVal: params = {"data_source": "", "span_length": 1, From 42ad65eb0900e1664459833412c3cc99212c243d Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 5 Nov 2020 17:49:51 -0500 Subject: [PATCH 147/151] Add small error-message changes to tests --- validator/tests/test_checks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index fd7250a62..003e7db10 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -148,7 +148,8 @@ def test_expected_groups(self): class TestCheckBadGeoIdFormat: params = {"data_source": "", "span_length": 0, - "end_date": "2020-09-02", "expected_lag": {}} + "end_date": "2020-09-02", "expected_lag": {}, + "validator_static_file_dir": "../static"} def test_empty_df(self): validator = Validator(self.params) @@ -270,7 +271,7 @@ def test_invalid_geo_id_hrr(self): validator.check_bad_geo_id_value(df, "name", "hrr") assert len(validator.raised_errors) == 1 - assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id assert len(validator.raised_errors[0].expression) == 3 assert "1" not in validator.raised_errors[0].expression assert "11" not in validator.raised_errors[0].expression @@ -285,7 +286,7 @@ def test_invalid_geo_id_state(self): validator.check_bad_geo_id_value(df, "name", "state") assert len(validator.raised_errors) == 1 - assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id assert len(validator.raised_errors[0].expression) == 1 assert "ak" not in validator.raised_errors[0].expression assert "aa" in validator.raised_errors[0].expression @@ -306,7 +307,7 @@ def 
test_invalid_geo_id_national(self): validator.check_bad_geo_id_value(df, "name", "national") assert len(validator.raised_errors) == 1 - assert "check_geo_id_value" in validator.raised_errors[0].check_data_id + assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id assert len(validator.raised_errors[0].expression) == 1 assert "us" not in validator.raised_errors[0].expression assert "zz" in validator.raised_errors[0].expression From fb1537349fb9204fa9ca54f1263b3208e56bb3d0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 6 Nov 2020 10:47:35 -0500 Subject: [PATCH 148/151] fix tests --- validator/delphi_validator/validate.py | 20 ++++++++++++-------- validator/tests/test_checks.py | 12 ++++++------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index ea7350b82..bb0083687 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -113,7 +113,8 @@ def __init__(self, params): """ # Get user settings from params or if not provided, set default. self.data_source = params['data_source'] - self.validator_static_file_dir = params.get('validator_static_file_dir', '../validator/static') + self.validator_static_file_dir = params.get( + 'validator_static_file_dir', '../validator/static') # Date/time settings self.span_length = timedelta(days=params['span_length']) @@ -254,19 +255,21 @@ def check_bad_geo_id_value(self, df_to_test, filename, geo_type): - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) 
of data """ file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv') - valid_geo_df = pd.read_csv(file_path, dtype = {'geo_id': str}) + valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str}) valid_geos = valid_geo_df['geo_id'].values - unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() not in valid_geos] + unexpected_geos = [geo for geo in df_to_test['geo_id'] + if geo.lower() not in valid_geos] if len(unexpected_geos) > 0: self.raised_errors.append(ValidationError( ("check_bad_geo_id_value", filename), unexpected_geos, "Unrecognized geo_ids (not in historical data)")) self.increment_total_checks() - upper_case_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() != geo] + upper_case_geos = [ + geo for geo in df_to_test['geo_id'] if geo.lower() != geo] if len(upper_case_geos) > 0: self.raised_warnings.append(ValidationError( ("check_geo_id_lowercase", filename), - upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred.")) + upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred.")) self.increment_total_checks() def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type): @@ -306,8 +309,8 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type): if geo_type in fill_len.keys(): # Left-pad with zeroes up to expected length. Fixes missing leading zeroes # caused by FIPS codes saved as numeric. 
- df_to_test["geo_id"] = [geo.zfill(fill_len[geo_type]) - for geo in df_to_test["geo_id"]] + df_to_test["geo_id"] = pd.Series([geo.zfill(fill_len[geo_type]) + for geo in df_to_test["geo_id"]], dtype=str) expected_geos = [geo[0] for geo in df_to_test['geo_id'].str.findall( geo_regex) if len(geo) > 0] @@ -747,7 +750,8 @@ def validate(self, export_dir): self.check_df_format(data_df, filename) self.check_bad_geo_id_format( data_df, filename, match.groupdict()['geo_type']) - self.check_bad_geo_id_value(data_df, filename, match.groupdict()['geo_type']) + self.check_bad_geo_id_value( + data_df, filename, match.groupdict()['geo_type']) self.check_bad_val(data_df, filename, match.groupdict()['signal']) self.check_bad_se(data_df, filename) self.check_bad_sample_size(data_df, filename) diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py index d73d57dc7..04ee98c71 100644 --- a/validator/tests/test_checks.py +++ b/validator/tests/test_checks.py @@ -6,8 +6,6 @@ from delphi_validator.datafetcher import filename_regex from delphi_validator.validate import Validator, make_date_filter -import pdb - class TestDateFilter: @@ -148,8 +146,7 @@ def test_expected_groups(self): class TestCheckBadGeoIdFormat: params = {"data_source": "", "span_length": 0, - "end_date": "2020-09-02", "expected_lag": {}, - "validator_static_file_dir": "../static"} + "end_date": "2020-09-02", "expected_lag": {}} def test_empty_df(self): validator = Validator(self.params) @@ -231,9 +228,11 @@ def test_invalid_geo_id_national(self): assert "US" not in validator.raised_errors[0].expression assert "SP" not in validator.raised_errors[0].expression + class TestCheckBadGeoIdValue: params = {"data_source": "", "span_length": 0, - "end_date": "2020-09-02", "expected_lag": {}} + "end_date": "2020-09-02", "expected_lag": {}, + "validator_static_file_dir": "../static"} def test_empty_df(self): validator = Validator(self.params) @@ -267,7 +266,8 @@ def test_invalid_geo_id_msa(self): def 
test_invalid_geo_id_hrr(self): validator = Validator(self.params) - df = pd.DataFrame(["1", "11", "111", "8", "88", "888"], columns=["geo_id"]) + df = pd.DataFrame(["1", "11", "111", "8", "88", + "888"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "hrr") assert len(validator.raised_errors) == 1 From 22e88e2b193df340724055216fe747600bd6942e Mon Sep 17 00:00:00 2001 From: JedGrabman Date: Mon, 9 Nov 2020 13:35:57 -0500 Subject: [PATCH 149/151] Make geo_id checks robust Automatically determine signals and data sources to use for retrieving geo_values. This adds robustness at the cost of efficiency. --- validator/scripts/unique_geoids.R | 23 ++++---- validator/static/county_geo.csv | 89 +++++++++++++++++++++++++++++++ validator/static/msa_geo.csv | 11 ++++ validator/static/state_geo.csv | 4 ++ 4 files changed, 117 insertions(+), 10 deletions(-) diff --git a/validator/scripts/unique_geoids.R b/validator/scripts/unique_geoids.R index 52edec904..676223be3 100644 --- a/validator/scripts/unique_geoids.R +++ b/validator/scripts/unique_geoids.R @@ -1,15 +1,18 @@ library(covidcast) +library(dplyr) +meta_info = covidcast_meta() +locations_by_type = meta_info %>% group_by(geo_type) %>% summarize(Value = max(num_locations)) -geo_types = c("county", "state", "hrr", "msa") -for(type in geo_types){ - dtf = covidcast_signal("indicator-combination", "confirmed_7dav_incidence_num", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = type) +results = list() +for (i in 1:nrow(locations_by_type)){ + type = locations_by_type$geo_type[i] + max_locations = locations_by_type$Value[i] + max_row = with(meta_info, meta_info[geo_type == type & num_locations == max_locations,][1,]) + data_source = max_row$data_source + signal = max_row$signal + results[[i]] = covidcast_signal(data_source, signal, geo_type = type) + geo_values = sort(unique(results[[i]]$geo_value)) file_name = paste0("../static/", type, "_geo.csv") - write.table(unique(dtf$geo_value), file = file_name, 
row.names = F, col.names = "geo_id") + write.table(geo_values, file = file_name, row.names = F, col.names = "geo_id") } -dtf = covidcast_signal("ght", "raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma") -file_name = "../static/dma_geo.csv" -write.table(unique(dtf$geo_value), file = file_name, row.names = F, col.names = "geo_id") - -national_file = "../static/national_geo.csv" -write.table("us", file = national_file, row.names = F, col.names = "geo_id") diff --git a/validator/static/county_geo.csv b/validator/static/county_geo.csv index a6127cf9b..3812b9693 100644 --- a/validator/static/county_geo.csv +++ b/validator/static/county_geo.csv @@ -94,6 +94,7 @@ "02230" "02240" "02261" +"02270" "02275" "02282" "02290" @@ -326,6 +327,7 @@ "10001" "10003" "10005" +"11000" "11001" "12000" "12001" @@ -2458,6 +2460,7 @@ "46107" "46109" "46111" +"46113" "46115" "46117" "46119" @@ -3191,4 +3194,90 @@ "56041" "56043" "56045" +"60000" +"66000" +"69000" +"70002" +"70003" "72000" +"72001" +"72003" +"72005" +"72007" +"72009" +"72011" +"72013" +"72015" +"72017" +"72019" +"72021" +"72023" +"72025" +"72027" +"72029" +"72031" +"72033" +"72035" +"72037" +"72039" +"72041" +"72043" +"72045" +"72047" +"72049" +"72051" +"72053" +"72054" +"72055" +"72057" +"72059" +"72061" +"72063" +"72065" +"72067" +"72069" +"72071" +"72073" +"72075" +"72077" +"72079" +"72081" +"72083" +"72085" +"72087" +"72089" +"72091" +"72093" +"72095" +"72097" +"72099" +"72101" +"72103" +"72105" +"72107" +"72109" +"72111" +"72113" +"72115" +"72117" +"72119" +"72121" +"72123" +"72125" +"72127" +"72129" +"72131" +"72133" +"72135" +"72137" +"72139" +"72141" +"72143" +"72145" +"72147" +"72149" +"72151" +"72153" +"72888" +"72999" +"78000" diff --git a/validator/static/msa_geo.csv b/validator/static/msa_geo.csv index 9025de71a..a8d1043d6 100644 --- a/validator/static/msa_geo.csv +++ b/validator/static/msa_geo.csv @@ -1,5 +1,6 @@ "geo_id" "10180" +"10380" "10420" "10500" "10540" @@ -14,6 +15,7 @@ 
"11460" "11500" "11540" +"11640" "11700" "12020" "12060" @@ -144,11 +146,13 @@ "24660" "24780" "24860" +"25020" "25060" "25180" "25220" "25260" "25420" +"25500" "25540" "25620" "25860" @@ -175,6 +179,7 @@ "27780" "27860" "27900" +"27980" "28020" "28100" "28140" @@ -218,6 +223,7 @@ "31740" "31860" "31900" +"32420" "32580" "32780" "32820" @@ -272,6 +278,7 @@ "38300" "38340" "38540" +"38660" "38860" "38900" "38940" @@ -309,7 +316,9 @@ "41700" "41740" "41860" +"41900" "41940" +"41980" "42020" "42100" "42140" @@ -334,6 +343,7 @@ "44180" "44220" "44300" +"44420" "44700" "44940" "45060" @@ -376,6 +386,7 @@ "49180" "49340" "49420" +"49500" "49620" "49660" "49700" diff --git a/validator/static/state_geo.csv b/validator/static/state_geo.csv index e4d129ad6..8bba20eac 100644 --- a/validator/static/state_geo.csv +++ b/validator/static/state_geo.csv @@ -2,6 +2,7 @@ "ak" "al" "ar" +"as" "az" "ca" "co" @@ -10,6 +11,7 @@ "de" "fl" "ga" +"gu" "hi" "ia" "id" @@ -24,6 +26,7 @@ "mi" "mn" "mo" +"mp" "ms" "mt" "nc" @@ -46,6 +49,7 @@ "tx" "ut" "va" +"vi" "vt" "wa" "wi" From b7001384b781f777f4e12126a8a17f51bdc43128 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 9 Nov 2020 18:41:34 -0500 Subject: [PATCH 150/151] update plans with feedback from modeling meeting --- validator/PLANS.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index f47393414..3fb290022 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -6,7 +6,7 @@ * Recognized file name format * Recognized geographical type (county, state, etc) * Recognized geo id format (e.g. 
state is two lowercase letters) -* Geo id has been seen before, in historical data +* Specific geo id has been seen before, in historical data * Missing geo type + signal + date combos based on the geo type + signal combos Covidcast metadata says should be available * Missing ‘val’ values * Negative ‘val’ values @@ -44,6 +44,9 @@ ### Larger issues +* Expand framework to support nchs_mortality, which is provided on a weekly basis and has some differences from the daily data. E.g. filenames use a different format ("weekly_YYYYWW_geotype_signalname.csv") +* Make backtesting framework so new checks can be run individually on historical indicator data to tune false positives, output verbosity, understand frequency of error raising, etc. Should pull data from API the first time and save locally in `cache` dir. +* Add DETAILS.md doc with detailed descriptions of what each check does and how. Will be especially important for statistical/anomaly detection checks. * Improve errors and error report * Check if [errors raised from validating all signals](https://docs.google.com/spreadsheets/d/1_aRBDrNeaI-3ZwuvkRNSZuZ2wfHJk6Bxj35Ol_XZ9yQ/edit#gid=1226266834) are correct, not false positives, not overly verbose or repetitive * Easier suppression of many errors at once @@ -62,7 +65,8 @@ * Currently, any API fetch problems just doesn't do comparative checks at all. * Improve performance and reduce runtime (no particular goal, just avoid being painfully slow!) * Profiling (iterate) - * Check if saving intermediate files will improve efficiency (currently a bottleneck at "individual file checks" section. Parallelize?) + * Save intermediate files? + * Currently a bottleneck at "individual file checks" section. Parallelize? * Make `all_frames` MultiIndex-ed by geo type and signal name? Make a dict of data indexed by geo type and signal name? May improve performance or may just make access more readable. 
* Ensure validator runs on signals that require AWS credentials (iterate) From e2c86a3f1b3cb79239cacb9d4dfbb55d464d28cb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 18 Nov 2020 16:29:05 -0500 Subject: [PATCH 151/151] update plans, small clean-up items --- validator/PLANS.md | 3 +++ validator/delphi_validator/datafetcher.py | 1 - validator/delphi_validator/validate.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/validator/PLANS.md b/validator/PLANS.md index 3fb290022..531d62112 100644 --- a/validator/PLANS.md +++ b/validator/PLANS.md @@ -59,6 +59,9 @@ * Are there any geo regions where this might cause false positives? E.g. small counties or MSAs, certain signals (deaths, since it's << cases) * This test is partially captured by checking avgs in source vs reference data, unless erroneous zeroes continue for more than a week * Also partially captured by outlier checking. If zeroes aren't outliers, then it's hard to say that they're erroneous at all. +* Outlier detection (in progress) + * Current approach is tuned to daily cases and daily deaths; use just on those signals? + * prophet (package) detection is flexible, but needs 2-3 months historical data to fit on. May make sense to use if other statistical checks also need that much data. * Use known erroneous/anomalous days of source data to tune static thresholds and test behavior * If can't get data from API, do we want to use substitute data for the comparative checks instead? * E.g. 
most recent successful API pull -- might end up being a couple weeks older diff --git a/validator/delphi_validator/datafetcher.py b/validator/delphi_validator/datafetcher.py index 3fe71d4e9..b920259e4 100644 --- a/validator/delphi_validator/datafetcher.py +++ b/validator/delphi_validator/datafetcher.py @@ -6,7 +6,6 @@ import re from os import listdir from os.path import isfile, join -from datetime import datetime from itertools import product import pandas as pd import numpy as np diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index bb0083687..448a3a847 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -248,7 +248,8 @@ def check_df_format(self, df_to_test, nameformat): def check_bad_geo_id_value(self, df_to_test, filename, geo_type): """ - Check for bad geo_id values, by comparing to a list of known values (drawn from historical data) + Check for bad geo_id values, by comparing to a list of known values (drawn from + historical data) Arguments: - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check