def check_na_vals(self, geo_sig_df, geo_type, signal_type, report):
    """Check if there are any unexpected NA values.

    The six earliest rows of every geo ID (ordered by `time_value`) are allowed
    to be NA, so that newly introduced geo IDs do not raise errors. Every other
    row whose `val` is NA and whose `time_value` is on or after
    `self.params.time_window.start_date` is reported as a "check_val_missing"
    failure. `validate` calls this once per (geo type, signal) dataframe.

    Arguments:
        - geo_sig_df: pandas dataframe with `geo_id`, `time_value` and `val` columns
        - geo_type: str; geo type name (county, msa, hrr, state) as in the CSV name
        - signal_type: str; signal name as in the CSV name
        - report: ValidationReport; report where results are added

    Returns:
        - None
    """
    leading_na_allowed = 6
    start_date = self.params.time_window.start_date

    # Sort so that "first six" means the six earliest dates of each geo ID
    # rather than whatever row order the caller happened to supply. Sorting by
    # geo_id first keeps errors grouped per geo ID, in ascending geo ID order.
    ordered = geo_sig_df.sort_values(["geo_id", "time_value"])

    # Position of each row within its geo ID (0-based). This replaces
    # groupby.apply, whose result type (Series vs. DataFrame) varies with the
    # data, and Series.iteritems, which newer pandas releases no longer provide.
    row_rank = ordered.groupby("geo_id").cumcount()

    is_missing = ordered["val"].isnull().to_numpy()
    past_grace_period = (row_rank >= leading_na_allowed).to_numpy()
    missing = ordered[is_missing & past_grace_period]

    # Only rows inside the validation window are reported.
    missing = missing[(missing["time_value"] >= start_date).to_numpy()]

    for geo_id, date in zip(missing["geo_id"], missing["time_value"]):
        report.add_raised_error(
            ValidationFailure("check_val_missing",
                              geo_type=geo_type,
                              signal=signal_type,
                              date=date,
                              message=f"geo_id {geo_id}"))

    report.increment_total_checks()
class TestCheckNaVals:
    """Tests for `DynamicValidator.check_na_vals`."""

    params = {
        "common": {
            "data_source": "",
            "span_length": 14,
            "end_date": "2020-09-02"
        }
    }

    def test_missing(self):
        """All-NA input: geo IDs 0 and 1 (seven rows each) get one error apiece.

        Geo ID 2 contributes a single NA row and is expected to raise nothing.
        """
        missing_vals = [np.nan] * 15
        geo_ids = [0, 1] * 7 + [2]
        dates = ["2021-08-30"] * 14 + ["2021-05-01"]
        frame = pd.DataFrame(
            {"val": missing_vals, "geo_id": geo_ids, "time_value": dates}
        )
        # The validator is fed plain `date` objects rather than timestamps.
        frame.time_value = pd.to_datetime(frame.time_value).dt.date

        report = ValidationReport([])
        DynamicValidator(self.params).check_na_vals(frame, "geo", "signal", report)

        errors = report.raised_errors
        assert len(errors) == 2
        for error, expected_message in zip(errors, ("geo_id 0", "geo_id 1")):
            assert error.check_name == "check_val_missing"
            assert error.message == expected_message
a/_delphi_utils_python/tests/validator/test_static.py b/_delphi_utils_python/tests/validator/test_static.py index 09286ba9c..bf270b4fd 100644 --- a/_delphi_utils_python/tests/validator/test_static.py +++ b/_delphi_utils_python/tests/validator/test_static.py @@ -362,15 +362,6 @@ def test_empty_df(self): assert len(report.raised_errors) == 0 - def test_missing(self): - validator = StaticValidator(self.params) - report = ValidationReport([]) - df = pd.DataFrame([np.nan], columns=["val"]) - validator.check_bad_val(df, FILENAME, "signal", report) - - assert len(report.raised_errors) == 1 - assert report.raised_errors[0].check_name == "check_val_missing" - def test_lt_0(self): validator = StaticValidator(self.params) report = ValidationReport([])