Merge pull request #715 from sgsmob/validator

krivard · web-flow · commit 127c97643d3e · 2021-01-22T15:34:54.000-05:00
Allow wildcards in validation suppression syntax
diff --git a/validator/README.md b/validator/README.md
@@ -55,7 +55,7 @@ Please update the follow settings:
    * `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls
    * `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date
    * `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period). For example, if the data source of interest has a 2-week lag before all reports are in for a given date, `span_length` should be 14 days
-   * `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected
+   * `suppressed_errors`: list of pairs of (`check_name`, `file_name`) uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected. Either element of a pair (but not both at once) can be the wildcard `*`, which matches all check names or all file names, respectively.
    * `test_mode`: boolean; `true` checks only a small number of data files
 * `static`: settings for validations that don't require comparison with external COVIDcast API data
    * `minimum_sample_size` (default: 100): threshold for flagging small sample sizes as invalid
diff --git a/validator/delphi_validator/errors.py b/validator/delphi_validator/errors.py
@@ -36,7 +36,13 @@ def is_suppressed(self, suppressed_errors):
         errors_to_suppress: Set[Tuple[str]]
             set of (check_name, data_name) tuples to ignore.
         """
-        return (self.check_name, self.data_name) in suppressed_errors
+        if (self.check_name, self.data_name) in suppressed_errors:
+            return True
+        if (self.check_name, "*") in suppressed_errors:
+            return True
+        if ("*", self.data_name) in suppressed_errors:
+            return True
+        return False
 
     def __str__(self):
         return f"{self.check_name} failed for {self.data_name}: {self.message}"
diff --git a/validator/delphi_validator/static.py b/validator/delphi_validator/static.py
@@ -151,8 +151,8 @@ def check_bad_geo_id_value(self, df_to_test, filename, geo_type, report):
         if len(unexpected_geos) > 0:
             report.add_raised_error(ValidationFailure("check_bad_geo_id_value",
                                                       filename,
-                                                      f"Unrecognized geo_ids (not in historical "
-                                                      "data) {unexpected_geos}"))
+                                                      "Unrecognized geo_ids (not in historical "
+                                                      f"data) {unexpected_geos}"))
         report.increment_total_checks()
         upper_case_geos = [
             geo for geo in df_to_test['geo_id'] if geo.lower() != geo]
@@ -301,7 +301,18 @@ def check_bad_se(self, df_to_test, nameformat, report):
         df_to_test['se'] = df_to_test['se'].round(3)
         df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3)
 
-        if not self.params.missing_se_allowed:
+        if self.params.missing_se_allowed:
+            result = df_to_test.query(
+                '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))')
+
+            if not result.empty:
+                report.add_raised_error(
+                    ValidationFailure("check_se_missing_or_in_range",
+                                      nameformat,
+                                     "se must be NA or in (0, min(50,val*(1+eps))]"))
+
+            report.increment_total_checks()
+        else:
             # Find rows not in the allowed range for se.
             result = df_to_test.query(
                 '~((se > 0) & (se < 50) & (se <= se_upper_limit))')
@@ -322,18 +333,6 @@ def check_bad_se(self, df_to_test, nameformat, report):
 
             report.increment_total_checks()
 
-        elif self.params.missing_se_allowed:
-            result = df_to_test.query(
-                '~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))')
-
-            if not result.empty:
-                report.add_raised_error(
-                    ValidationFailure("check_se_missing_or_in_range",
-                                      nameformat,
-                                     "se must be NA or in (0, min(50,val*(1+eps))]"))
-
-            report.increment_total_checks()
-
         result_jeffreys = df_to_test.query('(val == 0) & (se == 0)')
         result_alt = df_to_test.query('se == 0')
 
@@ -367,7 +366,20 @@ def check_bad_sample_size(self, df_to_test, nameformat, report):
         Returns:
             - None
         """
-        if not self.params.missing_sample_size_allowed:
+        if self.params.missing_sample_size_allowed:
+            result = df_to_test.query(
+                '~(sample_size.isnull() | (sample_size >= @self.params.minimum_sample_size))')
+
+            if not result.empty:
+                report.add_raised_error(
+                    ValidationFailure("check_n_missing_or_gt_min",
+                                      nameformat,
+                                      "sample size must be NA or >= "
+                                      f"{self.params.minimum_sample_size}"))
+
+            report.increment_total_checks()
+
+        else:
             if df_to_test['sample_size'].isnull().values.any():
                 report.add_raised_error(
                     ValidationFailure("check_n_missing",
@@ -388,19 +400,6 @@ def check_bad_sample_size(self, df_to_test, nameformat, report):
 
             report.increment_total_checks()
 
-        elif self.params.missing_sample_size_allowed:
-            result = df_to_test.query(
-                '~(sample_size.isnull() | (sample_size >= @self.params.minimum_sample_size))')
-
-            if not result.empty:
-                report.add_raised_error(
-                    ValidationFailure("check_n_missing_or_gt_min",
-                                      nameformat,
-                                      f"sample size must be NA or >= "\
-                                          "{self.params.minimum_sample_size}"))
-
-            report.increment_total_checks()
-
     def check_duplicate_rows(self, data_df, filename, report):
         """
         Check if any rows are duplicated in a data set.
diff --git a/validator/tests/test_errors.py b/validator/tests/test_errors.py
@@ -0,0 +1,15 @@
+"""Tests for errors.py."""
+from delphi_validator.errors import ValidationFailure
+
+class TestValidationFailure:
+    """Tests for ValidationFailure class."""
+
+    def test_is_suppressed(self):
+        """Tests the suppression of failures."""
+        vf = ValidationFailure("a", "b", "c")
+        assert vf.is_suppressed(set([("a", "b")]))
+        assert vf.is_suppressed(set([("*", "b")]))
+        assert vf.is_suppressed(set([("a", "*")]))
+        assert not vf.is_suppressed(set([("c", "*")]))
+        assert not vf.is_suppressed(set([("*", "*")]))
+        assert not vf.is_suppressed(set([("c", "d")]))