diff --git a/validator/delphi_validator/report.py b/validator/delphi_validator/report.py
new file mode 100644
index 000000000..89af69700
--- /dev/null
+++ b/validator/delphi_validator/report.py
@@ -0,0 +1,98 @@
+"""Validation output reports."""
+import sys
+from datetime import date, datetime
+from typing import List, Tuple
+
+
+class ValidationReport:
+    """Class for reporting the results of validation."""
+    def __init__(self, errors_to_suppress: List[Tuple[str, ...]]):
+        """Initialize a ValidationReport.
+
+        Parameters
+        ----------
+        errors_to_suppress: List[Tuple[str, ...]]
+            List of error identifications to ignore.
+
+        Attributes
+        ----------
+        errors_to_suppress: List[Tuple[str, ...]]
+            See above
+        num_suppressed: int
+            Number of errors suppressed
+        total_checks: int
+            Number of validation checks performed
+        raised_errors: List[Exception]
+            Errors raised from validation failures
+        raised_warnings: List[Exception]
+            Warnings raised from validation execution
+        unsuppressed_errors: List[Exception]
+            Errors raised from validation failures not found in `self.errors_to_suppress`
+        """
+        self.errors_to_suppress = errors_to_suppress.copy()
+        self.num_suppressed = 0
+        self.total_checks = 0
+        self.raised_errors = []
+        self.raised_warnings = []
+        self.unsuppressed_errors = []
+
+    def add_raised_error(self, error):
+        """Add an error to the report.
+
+        Parameters
+        ----------
+        error: Exception
+            Error raised in validation
+
+        Returns
+        -------
+        None
+        """
+        self.raised_errors.append(error)
+        # Convert any dates in check_data_id to strings for the purpose of comparing
+        # to manually suppressed errors.
+        raised_check_id = tuple([
+            item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime))
+            else item for item in error.check_data_id])
+
+        if raised_check_id in self.errors_to_suppress:
+            self.errors_to_suppress.remove(raised_check_id)
+            self.num_suppressed += 1
+        else:
+            self.unsuppressed_errors.append(error)
+
+    def increment_total_checks(self):
+        """Record that a check was run."""
+        self.total_checks += 1
+
+    def add_raised_warning(self, warning):
+        """Add a warning to the report.
+
+        Parameters
+        ----------
+        warning: Warning
+            Warning raised in validation
+
+        Returns
+        -------
+        None
+        """
+        self.raised_warnings.append(warning)
+
+    def __str__(self):
+        """Return a human-readable string representation of the report."""
+        out_str = f"{self.total_checks} checks run\n"
+        out_str += f"{len(self.unsuppressed_errors)} checks failed\n"
+        out_str += f"{self.num_suppressed} checks suppressed\n"
+        out_str += f"{len(self.raised_warnings)} warnings\n"
+        for message in self.unsuppressed_errors:
+            out_str += f"{message}\n"
+        for message in self.raised_warnings:
+            out_str += f"{message}\n"
+        return out_str
+
+    def print_and_exit(self):
+        """
+        Print results and exit with a non-zero status if any unsuppressed errors were raised.
+ """ + print(self) + if len(self.unsuppressed_errors) != 0: + sys.exit(1) + else: + sys.exit(0) diff --git a/validator/delphi_validator/run.py b/validator/delphi_validator/run.py index ec72f14f0..ed6236b34 100644 --- a/validator/delphi_validator/run.py +++ b/validator/delphi_validator/run.py @@ -14,4 +14,4 @@ def run_module(): params = parent_params['validation'] validator = Validator(params) - validator.validate(parent_params["export_dir"]) + validator.validate(parent_params["export_dir"]).print_and_exit() diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py index 917201210..0ffd2d3f2 100644 --- a/validator/delphi_validator/validate.py +++ b/validator/delphi_validator/validate.py @@ -2,7 +2,6 @@ """ Tools to validate CSV source data, including various check methods. """ -import sys import re import math from os.path import join @@ -11,6 +10,7 @@ from .errors import ValidationError, APIDataFetchError from .datafetcher import FILENAME_REGEX, get_geo_signal_combos, threaded_api_calls, load_all_files from .utils import GEO_REGEX_DICT, relative_difference_by_min, aggregate_frames +from .report import ValidationReport class Validator(): """ Class containing validation() function and supporting functions. Stores a list @@ -44,10 +44,6 @@ def __init__(self, params): avg, etc) - expected_lag: dict of signal names: int pairs; how many days behind do we expect each signal to be - - suppressed_errors: set of check_data_ids used to identify error messages to ignore - - raised_errors: list to append data upload-blocking errors to as they are raised - - total_checks: incremental counter to track total number of checks run - - raised_warnings: list to append non-data upload-blocking errors to as they are raised """ # TODO(https://github.com/cmu-delphi/covidcast-indicators/issues/579) # Refactor this class to avoid the too-many-instance-attributes error. 
@@ -87,16 +83,9 @@ def __init__(self, params):
         self.suppressed_errors = {(item,) if not isinstance(item, tuple) and not isinstance(
             item, list) else tuple(item) for item in params.get('suppressed_errors', [])}
 
-        # Output
-        self.raised_errors = []
-        self.total_checks = 0
-
-        self.raised_warnings = []
+        self.active_report = ValidationReport(self.suppressed_errors)
 
         # pylint: enable=too-many-instance-attributes
 
-    def increment_total_checks(self):
-        """ Add 1 to total_checks counter """
-        self.total_checks += 1
 
     def check_missing_date_files(self, daily_filenames):
         """
@@ -106,6 +95,7 @@ def check_missing_date_files(self, daily_filenames):
         - daily_filenames: List[Tuple(str, re.match, pd.DataFrame)]
             triples of filenames, filename matches with the geo regex, and the data from the file
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
@@ -124,13 +114,14 @@ def check_missing_date_files(self, daily_filenames):
         check_dateholes.sort()
 
         if check_dateholes:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 "check_missing_date_files",
                 check_dateholes,
                 "Missing dates are observed; if these dates are" +
                 " already in the API they would not be updated"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
+
 
     def check_df_format(self, df_to_test, nameformat):
         """
@@ -146,18 +137,18 @@
         """
         pattern_found = FILENAME_REGEX.match(nameformat)
         if not nameformat or not pattern_found:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_filename_format", nameformat),
                 nameformat, 'nameformat not recognized'))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
         if not isinstance(df_to_test, pd.DataFrame):
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_file_data_format", nameformat),
                 type(df_to_test), 'df_to_test must be a pandas dataframe.'))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
         """
@@ -167,24 +158,25 @@ def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
         Arguments:
         - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check
         - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data
-        """
+        All results are added to self.active_report.
+        """
         file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv')
         valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str})
         valid_geos = valid_geo_df['geo_id'].values
         unexpected_geos = [geo for geo in df_to_test['geo_id']
                            if geo.lower() not in valid_geos]
         if len(unexpected_geos) > 0:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_bad_geo_id_value", filename),
                 unexpected_geos, "Unrecognized geo_ids (not in historical data)"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
         upper_case_geos = [
             geo for geo in df_to_test['geo_id'] if geo.lower() != geo]
         if len(upper_case_geos) > 0:
-            self.raised_warnings.append(ValidationError(
+            self.active_report.add_raised_warning(ValidationError(
                 ("check_geo_id_lowercase", filename),
                 upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred."))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type):
         """
@@ -193,6 +185,7 @@ def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type):
         Arguments:
         - df_to_test: pandas dataframe of CSV source data
         - geo_type: string from CSV name specifying geo type (county, msa, hrr, state) of data
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
@@ -216,7 +209,7 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type):
                 df_to_test["geo_id"] = [geo[0]
                                         for geo in df_to_test["geo_id"].str.split(".")]
 
-                self.raised_warnings.append(ValidationError(
+                self.active_report.add_raised_warning(ValidationError(
                     ("check_geo_id_type", nameformat),
                     None, "geo_ids saved as floats; strings preferred"))
@@ -233,19 +226,19 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type):
                               df_to_test['geo_id']) if geo not in expected_geos}
             if len(unexpected_geos) > 0:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_geo_id_format", nameformat),
                     unexpected_geos, "Non-conforming geo_ids found"))
 
         if geo_type not in GEO_REGEX_DICT:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_geo_type", nameformat),
                 geo_type, "Unrecognized geo type"))
         else:
             find_all_unexpected_geo_ids(
                 df_to_test, GEO_REGEX_DICT[geo_type], geo_type)
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_bad_val(self, df_to_test, nameformat, signal_type):
         """
@@ -254,6 +247,7 @@ def check_bad_val(self, df_to_test, nameformat, signal_type):
         Arguments:
         - df_to_test: pandas dataframe of a single CSV of source data
         - signal_type: string from CSV name specifying signal type (smoothed_cli, etc) of data
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
@@ -264,36 +258,36 @@ def check_bad_val(self, df_to_test, nameformat, signal_type):
 
         if percent_option:
             if not df_to_test[(df_to_test['val'] > 100)].empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_val_pct_gt_100", nameformat),
                     df_to_test[(df_to_test['val'] > 100)],
                     "val column can't have any cell greater than 100 for percents"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
         if proportion_option:
             if not df_to_test[(df_to_test['val'] > 100000)].empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_val_prop_gt_100k", nameformat),
                     df_to_test[(df_to_test['val'] > 100000)],
                     "val column can't have any cell greater than 100000 for proportions"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
         if df_to_test['val'].isnull().values.any():
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_val_missing", nameformat),
                 None, "val column can't have any cell that is NA"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
         if not df_to_test[(df_to_test['val'] < 0)].empty:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_val_lt_0", nameformat),
                 df_to_test[(df_to_test['val'] < 0)],
                 "val column can't have any cell smaller than 0"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_bad_se(self, df_to_test, nameformat):
         """
@@ -320,35 +314,35 @@ def check_bad_se(self, df_to_test, nameformat):
                 '~((se > 0) & (se < 50) & (se <= se_upper_limit))')
 
             if not result.empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_se_not_missing_and_in_range", nameformat),
                     result, "se must be in (0, min(50,val*(1+eps))] and not missing"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
            if df_to_test["se"].isnull().mean() > 0.5:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_se_many_missing", nameformat),
                     None, 'Recent se values are >50% NA'))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
         elif self.missing_se_allowed:
             result = df_to_test.query(
                 '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))')
 
             if not result.empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_se_missing_or_in_range", nameformat),
                     result, "se must be NA or in (0, min(50,val*(1+eps))]"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
         result_jeffreys = df_to_test.query('(val == 0) & (se == 0)')
         result_alt = df_to_test.query('se == 0')
 
         if not result_jeffreys.empty:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_se_0_when_val_0", nameformat),
                 None,
                 "when signal value is 0, se must be non-zero. please "
@@ -356,11 +350,11 @@ def check_bad_se(self, df_to_test, nameformat):
                 + " (see wikipedia.org/wiki/Binomial_proportion_confidence"
                 + "_interval#Jeffreys_interval for details)"))
         elif not result_alt.empty:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_se_0", nameformat),
                 result_alt, "se must be non-zero"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
         # Remove se_upper_limit column.
         df_to_test.drop(columns=["se_upper_limit"])
@@ -373,40 +367,41 @@ def check_bad_sample_size(self, df_to_test, nameformat):
         - df_to_test: pandas dataframe of a single CSV of source data
             (one day-signal-geo_type combo)
         - nameformat: str CSV name; for example, "20200624_county_smoothed_nohh_cmnty_cli.csv"
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
         if not self.missing_sample_size_allowed:
             if df_to_test['sample_size'].isnull().values.any():
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_n_missing", nameformat),
                     None, "sample_size must not be NA"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
             # Find rows with sample size less than minimum allowed
             result = df_to_test.query(
                 '(sample_size < @self.minimum_sample_size)')
 
             if not result.empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_n_gt_min", nameformat),
                     result, f"sample size must be >= {self.minimum_sample_size}"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
         elif self.missing_sample_size_allowed:
             result = df_to_test.query(
                 '~(sample_size.isnull() | (sample_size >= @self.minimum_sample_size))')
 
             if not result.empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_n_missing_or_gt_min", nameformat),
                     result,
                     f"sample size must be NA or >= {self.minimum_sample_size}"))
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
 
     def check_min_allowed_max_date(self, max_date, geo_type, signal_type):
         """
@@ -416,6 +411,7 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type):
         - max_date: date of most recent data to be validated; datetime format.
         - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name
         - signal_type: str; signal name as in the CSV name
+        All results are added to self.active_report.
 
         Returns:
         - None
@@ -425,12 +421,12 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type):
             else 1)
 
         if max_date < self.generation_date - thres:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_min_max_date", geo_type, signal_type),
                 max_date,
                 "date of most recent generated file seems too long ago"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_max_allowed_max_date(self, max_date, geo_type, signal_type):
         """
@@ -440,17 +436,18 @@ def check_max_allowed_max_date(self, max_date, geo_type, signal_type):
         - max_date: date of most recent data to be validated; datetime format.
         - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name
         - signal_type: str; signal name as in the CSV name
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
         if max_date > self.generation_date:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_max_max_date", geo_type, signal_type),
                 max_date,
                 "date of most recent generated file seems too recent"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date,
                                     geo_type, signal_type):
         """
@@ -464,12 +461,13 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date
             COVIDcast API or semirecent data
         - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name
         - signal_type: str; signal name as in the CSV name
+        All results are added to self.active_report.
 
         Returns:
         - None
         """
         if df_to_test["time_value"].max() < df_to_reference["time_value"].max():
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_max_date_vs_reference",
                  checking_date.date(), geo_type, signal_type),
                 (df_to_test["time_value"].max(),
@@ -480,7 +478,7 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date
                 'working files have already been compared against the reference, '
                 + 'that there is a bug somewhere'))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_rapid_change_num_rows(self, df_to_test, df_to_reference,
                                     checking_date, geo_type, signal_type):
         """
@@ -494,6 +492,7 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date
         - checking_date: datetime date
         - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name
         - signal_type: str; signal name as in the CSV name
+        All results are added to self.active_report.
 
         Returns:
         - None
@@ -512,14 +511,14 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date
             raise e
 
         if abs(compare_rows) > 0.35:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_rapid_change_num_rows",
                  checking_date, geo_type, signal_type),
                 (test_rows_per_reporting_day, reference_rows_per_reporting_day),
                 "Number of rows per day (-with-any-rows) seems to have changed "
                 + "rapidly (reference vs test data)"))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_positive_negative_spikes(self, source_df, api_frames, geo, sig):
         """
@@ -539,7 +538,7 @@ def check_positive_negative_spikes(self, source_df, api_frames, geo, sig):
         - sig: str; signal name as in the CSV name
         """
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
         # Combine all possible frames so that the rolling window calculations make sense.
         source_frame_start = source_df["time_value"].min()
         source_frame_end = source_df["time_value"].max()
@@ -641,7 +640,7 @@ def outlier_nearby(frame):
             "time_value >= @source_frame_start & time_value <= @source_frame_end")
 
         if source_outliers.shape[0] > 0:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_positive_negative_spikes",
                  source_frame_start, source_frame_end, geo, sig),
                 (source_outliers),
@@ -744,7 +743,7 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date,
         mean_stdabsdiff_high = (df_all["mean_stdabsdiff"] > float(thres["mean_stdabsdiff"])).any()
 
         if mean_stddiff_high or mean_stdabsdiff_high:
-            self.raised_errors.append(ValidationError(
+            self.active_report.add_raised_error(ValidationError(
                 ("check_test_vs_reference_avg_changed",
                  checking_date, geo_type, signal_type),
                 (mean_stddiff_high, mean_stdabsdiff_high),
@@ -754,17 +753,17 @@ def check_avg_val_vs_reference(self, df_to_test, df_to_reference, checking_date,
                 + 'to average values of corresponding variables. For the former check, '
                 + 'tolerances for `val` are more restrictive than those for other columns.'))
 
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def check_duplicate_rows(self, data_df, filename):
         is_duplicate = data_df.duplicated()
         if (any(is_duplicate)):
             duplicate_row_idxs = list(data_df[is_duplicate].index)
-            self.raised_warnings.append(ValidationError(
+            self.active_report.add_raised_warning(ValidationError(
                 ("check_duplicate_rows", filename),
                 duplicate_row_idxs,
                 "Some rows are duplicated, which may indicate data integrity issues"))
-        self.increment_total_checks()
+        self.active_report.increment_total_checks()
 
     def validate(self, export_dir):
         """
@@ -774,13 +773,14 @@ def validate(self, export_dir):
         - export_dir: path to data CSVs
 
         Returns:
-        - None
+        - ValidationReport collating the validation outcomes
         """
+        self.active_report = ValidationReport(self.suppressed_errors)
         frames_list = load_all_files(export_dir, self.start_date, self.end_date)
         self._run_single_file_checks(frames_list)
         all_frames = aggregate_frames(frames_list)
         self._run_combined_file_checks(all_frames)
-        self.exit()
+        return self.active_report
 
     def _run_single_file_checks(self, file_list):
         """
@@ -850,10 +850,10 @@ def _run_combined_file_checks(self, all_frames):
             # Drop unused columns.
             geo_sig_df.drop(columns=["geo_type", "signal"])
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
             if geo_sig_df.empty:
-                self.raised_errors.append(ValidationError(
+                self.active_report.add_raised_error(ValidationError(
                     ("check_missing_geo_sig_combo", geo_type, signal_type),
                     None,
                     "file with geo_type-signal combo does not exist"))
@@ -866,9 +866,9 @@ def _run_combined_file_checks(self, all_frames):
             # Get relevant reference data from API dictionary.
             api_df_or_error = all_api_df[(geo_type, signal_type)]
 
-            self.increment_total_checks()
+            self.active_report.increment_total_checks()
             if isinstance(api_df_or_error, APIDataFetchError):
-                self.raised_errors.append(api_df_or_error)
+                self.active_report.raised_errors.append(api_df_or_error)
                 continue
 
             # Outlier dataframe
@@ -900,10 +900,10 @@ def _run_combined_file_checks(self, all_frames):
                 recent_df = geo_sig_df.query(
                     'time_value <= @checking_date & time_value >= @recent_cutoff_date')
 
-                self.increment_total_checks()
+                self.active_report.increment_total_checks()
 
                 if recent_df.empty:
-                    self.raised_errors.append(ValidationError(
+                    self.active_report.add_raised_error(ValidationError(
                         ("check_missing_geo_sig_date_combo",
                          checking_date, geo_type, signal_type),
                         None,
@@ -927,10 +927,10 @@ def _run_combined_file_checks(self, all_frames):
                     reference_api_df = api_df_or_error.query(
                         "time_value >= @reference_start_date & time_value <= @reference_end_date")
 
-                    self.increment_total_checks()
+                    self.active_report.increment_total_checks()
 
                    if reference_api_df.empty:
-                        self.raised_errors.append(ValidationError(
+                        self.active_report.add_raised_error(ValidationError(
                             ("empty_reference_data",
                              checking_date, geo_type, signal_type),
                             None,
                             "reference data is empty; comparative checks could not be performed"))
@@ -951,38 +951,3 @@ def _run_combined_file_checks(self, all_frames):
                 kroc += 1
                 if self.test_mode and kroc == 2:
                     break
-
-    def exit(self):
-        """
-        If any not-suppressed exceptions were raised, print and exit with non-zero status.
-        """
-        suppressed_counter = 0
-        subset_raised_errors = []
-
-        for val_error in self.raised_errors:
-            # Convert any dates in check_data_id to strings for the purpose of comparing
-            # to manually suppressed errors.
-            raised_check_id = tuple([
-                item.strftime("%Y-%m-%d") if isinstance(item, (date, datetime))
-                else item for item in val_error.check_data_id])
-
-            if raised_check_id not in self.suppressed_errors:
-                subset_raised_errors.append(val_error)
-            else:
-                self.suppressed_errors.remove(raised_check_id)
-                suppressed_counter += 1
-
-        print(self.total_checks, "checks run")
-        print(len(subset_raised_errors), "checks failed")
-        print(suppressed_counter, "checks suppressed")
-        print(len(self.raised_warnings), "warnings")
-
-        for message in subset_raised_errors:
-            print(message)
-        for message in self.raised_warnings:
-            print(message)
-
-        if len(subset_raised_errors) != 0:
-            sys.exit(1)
-        else:
-            sys.exit(0)
diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py
index 9cfb026a1..7e993dfdb 100644
--- a/validator/tests/test_checks.py
+++ b/validator/tests/test_checks.py
@@ -69,7 +69,7 @@ def test_default_settings(self):
         assert validator.sanity_check_value_diffs == True
         assert len(validator.suppressed_errors) == 0
         assert isinstance(validator.suppressed_errors, set)
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
 
 class TestCheckMissingDates:
@@ -82,10 +82,10 @@ def test_empty_filelist(self):
         filenames = list()
         validator.check_missing_date_files(filenames)
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_missing_date_files" in [
-            err.check_data_id[0] for err in validator.raised_errors]
-        assert len(validator.raised_errors[0].expression) == 9
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
+        assert len(validator.active_report.raised_errors[0].expression) == 9
 
     def test_same_day(self):
         params = {"data_source": "", "span_length": 0,
@@ -95,9 +95,9 @@ def test_same_day(self):
         filenames = [("20200901_county_signal_signal.csv", "match_obj")]
         validator.check_missing_date_files(filenames)
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
         assert "check_missing_date_files" not in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_duplicate_dates(self):
         params = {"data_source": "", "span_length": 1,
@@ -110,14 +110,14 @@ def test_duplicate_dates(self):
                      ("20200903_usa_signal_signal.csv", "match_obj")]
         validator.check_missing_date_files(filenames)
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_missing_date_files" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert len([err.expression[0] for
-                    err in validator.raised_errors if err.check_data_id[0] ==
+                    err in validator.active_report.raised_errors if err.check_data_id[0] ==
                     "check_missing_date_files"]) == 1
         assert [err.expression[0] for
-                err in validator.raised_errors if err.check_data_id[0] ==
+                err in validator.active_report.raised_errors if err.check_data_id[0] ==
                 "check_missing_date_files"][0] == datetime.strptime("20200902", "%Y%m%d").date()
 
 
@@ -153,18 +153,18 @@ def test_empty_df(self):
         empty_df = pd.DataFrame(columns=["geo_id"], dtype=str)
         validator.check_bad_geo_id_format(empty_df, "name", "county")
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
     def test_invalid_geo_type(self):
         validator = Validator(self.params)
         empty_df = pd.DataFrame(columns=["geo_id"], dtype=str)
         validator.check_bad_geo_id_format(empty_df, "name", "hello")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_geo_type" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert [err.expression for
-                err in validator.raised_errors if err.check_data_id[0] ==
+                err in validator.active_report.raised_errors if err.check_data_id[0] ==
                 "check_geo_type"][0] == "hello"
 
     def test_invalid_geo_id_county(self):
@@ -173,10 +173,10 @@ def test_invalid_geo_id_county(self):
                           "abc12"], columns=["geo_id"])
         validator.check_bad_geo_id_format(df, "name", "county")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_geo_id_format" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 2
-        assert "54321" not in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 2
+        assert "54321" not in validator.active_report.raised_errors[0].expression
 
     def test_invalid_geo_id_msa(self):
         validator = Validator(self.params)
@@ -184,10 +184,10 @@ def test_invalid_geo_id_msa(self):
                           "abc12"], columns=["geo_id"])
         validator.check_bad_geo_id_format(df, "name", "msa")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_geo_id_format" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 2
-        assert "54321" not in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 2
+        assert "54321" not in validator.active_report.raised_errors[0].expression
 
     def test_invalid_geo_id_hrr(self):
         validator = Validator(self.params)
@@ -195,12 +195,12 @@ def test_invalid_geo_id_hrr(self):
                           "a", ".", "ab1"], columns=["geo_id"])
         validator.check_bad_geo_id_format(df, "name", "hrr")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_geo_id_format" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 5
-        assert "1" not in validator.raised_errors[0].expression
-        assert "12" not in validator.raised_errors[0].expression
-        assert "123" not in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 5
+        assert "1" not in validator.active_report.raised_errors[0].expression
+        assert "12" not in validator.active_report.raised_errors[0].expression
+        assert "123" not in validator.active_report.raised_errors[0].expression
 
     def test_invalid_geo_id_state(self):
         validator = Validator(self.params)
@@ -208,12 +208,12 @@ def test_invalid_geo_id_state(self):
                           "Hawaii", "a", "H.I."], columns=["geo_id"])
         validator.check_bad_geo_id_format(df, "name", "state")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_geo_id_format" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 4
-        assert "aa" not in validator.raised_errors[0].expression
-        assert "hi" not in validator.raised_errors[0].expression
-        assert "HI" not in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 4
+        assert "aa" not in validator.active_report.raised_errors[0].expression
+        assert "hi" not in validator.active_report.raised_errors[0].expression
+        assert "HI" not in validator.active_report.raised_errors[0].expression
 
     def test_invalid_geo_id_national(self):
         validator = Validator(self.params)
@@ -221,12 +221,12 @@ def test_invalid_geo_id_national(self):
                           "usausa", "US"], columns=["geo_id"])
         validator.check_bad_geo_id_format(df, "name", "national")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_geo_id_format" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 3
-        assert "us" not in validator.raised_errors[0].expression
-        assert "US" not in validator.raised_errors[0].expression
-        assert "SP" not in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_geo_id_format" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 3
+        assert "us" not in validator.active_report.raised_errors[0].expression
+        assert "US" not in validator.active_report.raised_errors[0].expression
+        assert "SP" not in validator.active_report.raised_errors[0].expression
 
 
 class TestDuplicatedRows:
     params = {"data_source": "", "span_length": 1,
@@ -235,35 +235,35 @@ def test_no_duplicates(self):
         validator = Validator(self.params)
         df = pd.DataFrame([["a", "1"], ["b", "2"], ["c", "3"]])
         validator.check_duplicate_rows(df, "file")
-        assert len(validator.raised_warnings) == 0
+        assert len(validator.active_report.raised_warnings) == 0
 
     def test_single_column_duplicates_but_not_row(self):
         validator = Validator(self.params)
         df = pd.DataFrame([["a", "1"], ["a", "2"], ["b", "2"]])
         validator.check_duplicate_rows(df, "file")
-        assert len(validator.raised_warnings) == 0
+        assert len(validator.active_report.raised_warnings) == 0
 
     def test_non_consecutive_duplicates(self):
         validator = Validator(self.params)
         df = pd.DataFrame([["a", "1"], ["b", "2"], ["a", "1"]])
         validator.check_duplicate_rows(df, "file")
-        assert len(validator.raised_warnings) == 1
-        assert validator.raised_warnings[0].expression == [2]
-        assert validator.raised_warnings[0].check_data_id[1] == "file"
+        assert len(validator.active_report.raised_warnings) == 1
+        assert validator.active_report.raised_warnings[0].expression == [2]
+        assert validator.active_report.raised_warnings[0].check_data_id[1] == "file"
 
     def test_multiple_distinct_duplicates(self):
         validator = Validator(self.params)
         df = pd.DataFrame([["a", "1"], ["b", "2"], ["a", "1"], ["b", "2"]])
         validator.check_duplicate_rows(df, "file")
-        assert len(validator.raised_warnings) == 1
-        assert validator.raised_warnings[0].expression == [2, 3]
+        assert len(validator.active_report.raised_warnings) == 1
+        assert validator.active_report.raised_warnings[0].expression == [2, 3]
 
     def test_more_than_two_copies(self):
         validator = Validator(self.params)
         df = pd.DataFrame([["a", "1"], ["b", "2"], ["b", "2"], ["b", "2"]])
         validator.check_duplicate_rows(df, "file")
-        assert len(validator.raised_warnings) == 1
-        assert validator.raised_warnings[0].expression == [2, 3]
+        assert len(validator.active_report.raised_warnings) == 1
+        assert validator.active_report.raised_warnings[0].expression == [2, 3]
 
 
 class TestCheckBadGeoIdValue:
     params = {"data_source": "", "span_length": 0,
@@ -274,31 +274,31 @@ def test_empty_df(self):
         validator = Validator(self.params)
         empty_df = pd.DataFrame(columns=["geo_id"], dtype=str)
         validator.check_bad_geo_id_value(empty_df, "name", "county")
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
     def test_invalid_geo_id_county(self):
         validator = Validator(self.params)
         df = pd.DataFrame(["01001", "88888", "99999"], columns=["geo_id"])
         validator.check_bad_geo_id_value(df, "name", "county")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 2
-        assert "01001" not in validator.raised_errors[0].expression
-        assert "88888" in validator.raised_errors[0].expression
-        assert "99999" in validator.raised_errors[0].expression
+        assert len(validator.active_report.raised_errors) == 1
+        assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id
+        assert len(validator.active_report.raised_errors[0].expression) == 2
+        assert "01001" not in validator.active_report.raised_errors[0].expression
+        assert "88888" in validator.active_report.raised_errors[0].expression
+        assert "99999" in validator.active_report.raised_errors[0].expression
 
     def test_invalid_geo_id_msa(self):
         validator = Validator(self.params)
         df = pd.DataFrame(["10180", "88888", "99999"], columns=["geo_id"])
         validator.check_bad_geo_id_value(df, "name", "msa")
 
-        assert len(validator.raised_errors) == 1
-        assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id
-        assert len(validator.raised_errors[0].expression) == 2
-        assert "10180" not in validator.raised_errors[0].expression
"88888" in validator.raised_errors[0].expression - assert "99999" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 2 + assert "10180" not in validator.active_report.raised_errors[0].expression + assert "88888" in validator.active_report.raised_errors[0].expression + assert "99999" in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_hrr(self): validator = Validator(self.params) @@ -306,47 +306,47 @@ def test_invalid_geo_id_hrr(self): "888"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "hrr") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 3 - assert "1" not in validator.raised_errors[0].expression - assert "11" not in validator.raised_errors[0].expression - assert "111" not in validator.raised_errors[0].expression - assert "8" in validator.raised_errors[0].expression - assert "88" in validator.raised_errors[0].expression - assert "888" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 3 + assert "1" not in validator.active_report.raised_errors[0].expression + assert "11" not in validator.active_report.raised_errors[0].expression + assert "111" not in validator.active_report.raised_errors[0].expression + assert "8" in validator.active_report.raised_errors[0].expression + assert "88" in validator.active_report.raised_errors[0].expression + assert "888" in validator.active_report.raised_errors[0].expression def test_invalid_geo_id_state(self): validator = Validator(self.params) df = pd.DataFrame(["aa", "ak"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "state") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 1 - assert "ak" not in validator.raised_errors[0].expression - assert "aa" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 1 + assert "ak" not in validator.active_report.raised_errors[0].expression + assert "aa" in validator.active_report.raised_errors[0].expression def test_uppercase_geo_id(self): validator = Validator(self.params) df = pd.DataFrame(["ak", "AK"], columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "state") - assert len(validator.raised_errors) == 0 - assert len(validator.raised_warnings) == 1 - assert "check_geo_id_lowercase" in validator.raised_warnings[0].check_data_id - assert "AK" in validator.raised_warnings[0].expression + assert len(validator.active_report.raised_errors) == 0 + assert len(validator.active_report.raised_warnings) == 1 + assert "check_geo_id_lowercase" in validator.active_report.raised_warnings[0].check_data_id + assert "AK" in validator.active_report.raised_warnings[0].expression def test_invalid_geo_id_national(self): validator = Validator(self.params) df = pd.DataFrame(["us", "zz"], 
columns=["geo_id"]) validator.check_bad_geo_id_value(df, "name", "national") - assert len(validator.raised_errors) == 1 - assert "check_bad_geo_id_value" in validator.raised_errors[0].check_data_id - assert len(validator.raised_errors[0].expression) == 1 - assert "us" not in validator.raised_errors[0].expression - assert "zz" in validator.raised_errors[0].expression + assert len(validator.active_report.raised_errors) == 1 + assert "check_bad_geo_id_value" in validator.active_report.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors[0].expression) == 1 + assert "us" not in validator.active_report.raised_errors[0].expression + assert "zz" in validator.active_report.raised_errors[0].expression class TestCheckBadVal: @@ -360,39 +360,39 @@ def test_empty_df(self): validator.check_bad_val(empty_df, "", "prop") validator.check_bad_val(empty_df, "", "pct") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_missing(self): validator = Validator(self.params) df = pd.DataFrame([np.nan], columns=["val"]) validator.check_bad_val(df, "name", "signal") - assert len(validator.raised_errors) == 1 - assert "check_val_missing" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_missing" in validator.active_report.raised_errors[0].check_data_id def test_lt_0(self): validator = Validator(self.params) df = pd.DataFrame([-5], columns=["val"]) validator.check_bad_val(df, "name", "signal") - assert len(validator.raised_errors) == 1 - assert "check_val_lt_0" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_lt_0" in validator.active_report.raised_errors[0].check_data_id def test_gt_max_pct(self): validator = Validator(self.params) df = pd.DataFrame([1e7], columns=["val"]) validator.check_bad_val(df, "name", "pct") - assert len(validator.raised_errors) == 1 - assert "check_val_pct_gt_100" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_pct_gt_100" in validator.active_report.raised_errors[0].check_data_id def test_gt_max_prop(self): validator = Validator(self.params) df = pd.DataFrame([1e7], columns=["val"]) validator.check_bad_val(df, "name", "prop") - assert len(validator.raised_errors) == 1 - assert "check_val_prop_gt_100k" in validator.raised_errors[0].check_data_id + assert len(validator.active_report.raised_errors) == 1 + assert "check_val_prop_gt_100k" in validator.active_report.raised_errors[0].check_data_id class TestCheckBadSe: @@ -405,12 +405,12 @@ def test_empty_df(self): columns=["val", "se", "sample_size"], dtype=float) validator.check_bad_se(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_se_allowed = True validator.check_bad_se(empty_df, "") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_missing(self): validator = Validator(self.params) @@ -419,16 +419,16 @@ def test_missing(self): "val", "se", "sample_size"]) validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 validator.missing_se_allowed = False validator.check_bad_se(df, "name") - assert len(validator.raised_errors) == 2 + assert len(validator.active_report.raised_errors) == 2 assert "check_se_not_missing_and_in_range" in [ - 
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert "check_se_many_missing" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_e_0_missing_allowed(self):
         validator = Validator(self.params)
@@ -437,11 +437,11 @@ def test_e_0_missing_allowed(self):
                           1, np.nan, np.nan]], columns=["val", "se", "sample_size"])
         validator.check_bad_se(df, "name")
 
-        assert len(validator.raised_errors) == 2
+        assert len(validator.active_report.raised_errors) == 2
         assert "check_se_missing_or_in_range" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert "check_se_0" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_e_0_missing_not_allowed(self):
         validator = Validator(self.params)
@@ -450,11 +450,11 @@ def test_e_0_missing_not_allowed(self):
                           1, np.nan, np.nan]], columns=["val", "se", "sample_size"])
         validator.check_bad_se(df, "name")
 
-        assert len(validator.raised_errors) == 2
+        assert len(validator.active_report.raised_errors) == 2
         assert "check_se_not_missing_and_in_range" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert "check_se_0" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_jeffreys(self):
         validator = Validator(self.params)
@@ -463,11 +463,11 @@ def test_jeffreys(self):
                           1, np.nan, np.nan]], columns=["val", "se", "sample_size"])
         validator.check_bad_se(df, "name")
 
-        assert len(validator.raised_errors) == 2
+        assert len(validator.active_report.raised_errors) == 2
         assert "check_se_not_missing_and_in_range" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
         assert "check_se_0_when_val_0" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
 
 class TestCheckBadN:
@@ -480,12 +480,12 @@ def test_empty_df(self):
                                 columns=["val", "se", "sample_size"], dtype=float)
         validator.check_bad_sample_size(empty_df, "")
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
         validator.missing_sample_size_allowed = True
         validator.check_bad_sample_size(empty_df, "")
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
     def test_missing(self):
         validator = Validator(self.params)
@@ -494,14 +494,14 @@ def test_missing(self):
                           "val", "se", "sample_size"])
         validator.check_bad_sample_size(df, "name")
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
         validator.missing_sample_size_allowed = False
         validator.check_bad_sample_size(df, "name")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_n_missing" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_lt_min_missing_allowed(self):
         validator = Validator(self.params)
@@ -510,9 +510,9 @@ def test_lt_min_missing_allowed(self):
                           1, np.nan, np.nan]], columns=["val", "se", "sample_size"])
validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_n_missing_or_gt_min" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_lt_min_missing_not_allowed(self): validator = Validator(self.params) @@ -521,9 +521,9 @@ def test_lt_min_missing_not_allowed(self): 1, np.nan, 245]], columns=["val", "se", "sample_size"]) validator.check_bad_sample_size(df, "name") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_n_gt_min" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestCheckRapidChange: @@ -537,7 +537,7 @@ def test_same_df(self): validator.check_rapid_change_num_rows( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_0_vs_many(self): validator = Validator(self.params) @@ -549,9 +549,9 @@ def test_0_vs_many(self): validator.check_rapid_change_num_rows( test_df, ref_df, time_value, "geo", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_rapid_change_num_rows" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] class TestCheckAvgValDiffs: @@ -570,7 +570,7 @@ def test_same_val(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_se(self): validator = Validator(self.params) @@ -584,7 +584,7 @@ def test_same_se(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_n(self): validator = Validator(self.params) @@ -598,7 +598,7 @@ def test_same_n(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_same_val_se_n(self): validator = Validator(self.params) @@ -612,7 +612,7 @@ def test_same_val_se_n(self): validator.check_avg_val_vs_reference( test_df, ref_df, date.today(), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_10x_val(self): validator = Validator(self.params) @@ -627,7 +627,7 @@ def test_10x_val(self): test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") - assert len(validator.raised_errors) == 0 + assert len(validator.active_report.raised_errors) == 0 def test_100x_val(self): validator = Validator(self.params) @@ -642,9 +642,9 @@ def test_100x_val(self): test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal") - assert len(validator.raised_errors) == 1 + assert len(validator.active_report.raised_errors) == 1 assert "check_test_vs_reference_avg_changed" in [ - err.check_data_id[0] for err in validator.raised_errors] + err.check_data_id[0] for err in validator.active_report.raised_errors] def test_1000x_val(self): validator = Validator(self.params) @@ -659,10 +659,9 @@ def test_1000x_val(self): 
             test_df, ref_df,
             datetime.combine(date.today(), datetime.min.time()), "geo", "signal")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_test_vs_reference_avg_changed" in [
-            err.check_data_id[0] for err in validator.raised_errors]
-
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
 class TestDataOutlier:
     params = {"data_source": "", "span_length": 1,
@@ -702,9 +701,9 @@ def test_pos_outlier(self):
             test_df, ref_df,
             "state", "signal")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_positive_negative_spikes" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_neg_outlier(self):
         validator = Validator(self.params)
@@ -742,9 +741,9 @@ def test_neg_outlier(self):
             test_df, ref_df,
             "state", "signal")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_positive_negative_spikes" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_zero_outlier(self):
         validator = Validator(self.params)
@@ -782,9 +781,9 @@ def test_zero_outlier(self):
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_positive_negative_spikes" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
 
     def test_no_outlier(self):
         validator = Validator(self.params)
@@ -822,7 +821,7 @@ def test_no_outlier(self):
             test_df, ref_df,
             "state", "signal")
 
-        assert len(validator.raised_errors) == 0
+        assert len(validator.active_report.raised_errors) == 0
 
     def test_source_api_overlap(self):
         validator = Validator(self.params)
@@ -860,6 +859,6 @@ def test_source_api_overlap(self):
             test_df, ref_df,
             "state", "signal")
 
-        assert len(validator.raised_errors) == 1
+        assert len(validator.active_report.raised_errors) == 1
         assert "check_positive_negative_spikes" in [
-            err.check_data_id[0] for err in validator.raised_errors]
+            err.check_data_id[0] for err in validator.active_report.raised_errors]
diff --git a/validator/tests/test_report.py b/validator/tests/test_report.py
new file mode 100644
index 000000000..5269a6857
--- /dev/null
+++ b/validator/tests/test_report.py
@@ -0,0 +1,44 @@
+"""Tests for delphi_validator.report."""
+from datetime import date
+from delphi_validator.errors import ValidationError
+from delphi_validator.report import ValidationReport
+
+
+class TestValidationReport:
+    """Tests for ValidationReport class."""
+
+    ERROR_1 = ValidationError(("good", date(2020, 10, 5)), "exp 1", "msg 1")
+    ERROR_2 = ValidationError(("bad", date(2020, 11, 18)), "exp 2", "msg 2")
+
+    def test_add_raised_unsuppressed_error(self):
+        """Test that an unsuppressed error shows up in the unsuppressed error list."""
+        report = ValidationReport([("bad", "2020-10-05")])
+        report.add_raised_error(self.ERROR_1)
+        report.add_raised_error(self.ERROR_2)
+
+        assert report.unsuppressed_errors == [self.ERROR_1, self.ERROR_2]
+
+    def test_add_raised_suppressed_error(self):
+        """Test that a suppressed error does not show up in the unsuppressed error list."""
+        report = ValidationReport([("good", "2020-10-05")])
+        report.add_raised_error(self.ERROR_1)
+
+        assert len(report.unsuppressed_errors) == 0
+        assert report.num_suppressed == 1
+        assert len(report.errors_to_suppress) == 0
+
+        # Each error can only be suppressed once.
+        report.add_raised_error(self.ERROR_1)
+        assert report.unsuppressed_errors == [self.ERROR_1]
+
+    def test_str(self):
+        """Test that the string representation contains all information."""
+        report = ValidationReport([("good", "2020-10-05")])
+        report.increment_total_checks()
+        report.increment_total_checks()
+        report.increment_total_checks()
+        report.add_raised_warning(ImportWarning("wrong import"))
+        report.add_raised_warning(ImportWarning("right import"))
+        report.add_raised_error(self.ERROR_1)
+        report.add_raised_error(self.ERROR_2)
+
+        assert str(report) == "3 checks run\n1 checks failed\n1 checks suppressed\n2 warnings\n"\
+            "(('bad', datetime.date(2020, 11, 18)), 'exp 2', 'msg 2')\nwrong import\nright import\n"
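
For reviewers, a minimal usage sketch of the new reporting flow, using only the API this diff introduces (the check id and filename below are invented for illustration):

    from delphi_validator.errors import ValidationError
    from delphi_validator.report import ValidationReport

    # Suppress one known failure by its (stringified) check_data_id.
    report = ValidationReport([("check_val_lt_0", "20200901_county_sig.csv")])

    report.increment_total_checks()
    report.add_raised_error(ValidationError(
        ("check_val_lt_0", "20200901_county_sig.csv"),
        None, "val column can't have any cell smaller than 0"))

    print(report)  # 1 checks run / 0 checks failed / 1 checks suppressed / 0 warnings
    # report.print_and_exit() would exit with status 0 here, since the only
    # raised error was suppressed and unsuppressed_errors is empty.

Note the design change this enables: Validator.validate() now returns the report instead of exiting, so the CI-facing exit decision lives entirely in run.py via print_and_exit(), and tests can inspect validator.active_report without the process terminating.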