Skip to content

Commit 127c976

Browse files
authored
Merge pull request #715 from sgsmob/validator
Allow wildcards in validation suppression syntax
2 parents be9e43d + 705463a commit 127c976

File tree

4 files changed

+51
-31
lines changed

4 files changed

+51
-31
lines changed

validator/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ Please update the follow settings:
5555
* `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls
5656
* `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date
5757
* `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period). For example, if the data source of interest has a 2-week lag before all reports are in for a given date, `span_length` should be 14 days
58-
* `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected
58+
* `suppressed_errors`: list of pairs of (`check_name`, `file_name`) uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected. Either element of a pair may instead be `*` to match all check names or all file names, respectively. Note that the pair (`*`, `*`) is not treated as a match-all.
5959
* `test_mode`: boolean; `true` checks only a small number of data files
6060
* `static`: settings for validations that don't require comparison with external COVIDcast API data
6161
* `minimum_sample_size` (default: 100): threshold for flagging small sample sizes as invalid

validator/delphi_validator/errors.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ def is_suppressed(self, suppressed_errors):
3636
suppressed_errors: Set[Tuple[str]]
3737
set of (check_name, data_name) tuples to ignore.
3838
"""
39-
return (self.check_name, self.data_name) in suppressed_errors
39+
if (self.check_name, self.data_name) in suppressed_errors:
40+
return True
41+
if (self.check_name, "*") in suppressed_errors:
42+
return True
43+
if ("*", self.data_name) in suppressed_errors:
44+
return True
45+
return False
4046

4147
def __str__(self):
4248
return f"{self.check_name} failed for {self.data_name}: {self.message}"

validator/delphi_validator/static.py

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,8 @@ def check_bad_geo_id_value(self, df_to_test, filename, geo_type, report):
151151
if len(unexpected_geos) > 0:
152152
report.add_raised_error(ValidationFailure("check_bad_geo_id_value",
153153
filename,
154-
f"Unrecognized geo_ids (not in historical "
155-
"data) {unexpected_geos}"))
154+
"Unrecognized geo_ids (not in historical "
155+
f"data) {unexpected_geos}"))
156156
report.increment_total_checks()
157157
upper_case_geos = [
158158
geo for geo in df_to_test['geo_id'] if geo.lower() != geo]
@@ -301,7 +301,18 @@ def check_bad_se(self, df_to_test, nameformat, report):
301301
df_to_test['se'] = df_to_test['se'].round(3)
302302
df_to_test['se_upper_limit'] = df_to_test['se_upper_limit'].round(3)
303303

304-
if not self.params.missing_se_allowed:
304+
if self.params.missing_se_allowed:
305+
result = df_to_test.query(
306+
'~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))')
307+
308+
if not result.empty:
309+
report.add_raised_error(
310+
ValidationFailure("check_se_missing_or_in_range",
311+
nameformat,
312+
"se must be NA or in (0, min(50,val*(1+eps))]"))
313+
314+
report.increment_total_checks()
315+
else:
305316
# Find rows not in the allowed range for se.
306317
result = df_to_test.query(
307318
'~((se > 0) & (se < 50) & (se <= se_upper_limit))')
@@ -322,18 +333,6 @@ def check_bad_se(self, df_to_test, nameformat, report):
322333

323334
report.increment_total_checks()
324335

325-
elif self.params.missing_se_allowed:
326-
result = df_to_test.query(
327-
'~(se.isnull() | ((se > 0) & (se < 50) & (se <= se_upper_limit)))')
328-
329-
if not result.empty:
330-
report.add_raised_error(
331-
ValidationFailure("check_se_missing_or_in_range",
332-
nameformat,
333-
"se must be NA or in (0, min(50,val*(1+eps))]"))
334-
335-
report.increment_total_checks()
336-
337336
result_jeffreys = df_to_test.query('(val == 0) & (se == 0)')
338337
result_alt = df_to_test.query('se == 0')
339338

@@ -367,7 +366,20 @@ def check_bad_sample_size(self, df_to_test, nameformat, report):
367366
Returns:
368367
- None
369368
"""
370-
if not self.params.missing_sample_size_allowed:
369+
if self.params.missing_sample_size_allowed:
370+
result = df_to_test.query(
371+
'~(sample_size.isnull() | (sample_size >= @self.params.minimum_sample_size))')
372+
373+
if not result.empty:
374+
report.add_raised_error(
375+
ValidationFailure("check_n_missing_or_gt_min",
376+
nameformat,
377+
"sample size must be NA or >= "
378+
f"{self.params.minimum_sample_size}"))
379+
380+
report.increment_total_checks()
381+
382+
else:
371383
if df_to_test['sample_size'].isnull().values.any():
372384
report.add_raised_error(
373385
ValidationFailure("check_n_missing",
@@ -388,19 +400,6 @@ def check_bad_sample_size(self, df_to_test, nameformat, report):
388400

389401
report.increment_total_checks()
390402

391-
elif self.params.missing_sample_size_allowed:
392-
result = df_to_test.query(
393-
'~(sample_size.isnull() | (sample_size >= @self.params.minimum_sample_size))')
394-
395-
if not result.empty:
396-
report.add_raised_error(
397-
ValidationFailure("check_n_missing_or_gt_min",
398-
nameformat,
399-
f"sample size must be NA or >= "\
400-
"{self.params.minimum_sample_size}"))
401-
402-
report.increment_total_checks()
403-
404403
def check_duplicate_rows(self, data_df, filename, report):
405404
"""
406405
Check if any rows are duplicated in a data set.

validator/tests/test_errors.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Tests for errors.py."""
2+
from delphi_validator.errors import ValidationFailure
3+
4+
class TestValidationFailure:
5+
"""Tests for ValidationFailure class."""
6+
7+
def test_is_suppressed(self):
8+
"""Tests the suppression of failures."""
9+
vf = ValidationFailure("a", "b", "c")
10+
assert vf.is_suppressed(set([("a", "b")]))
11+
assert vf.is_suppressed(set([("*", "b")]))
12+
assert vf.is_suppressed(set([("a", "*")]))
13+
assert not vf.is_suppressed(set([("c", "*")]))
14+
assert not vf.is_suppressed(set([("*", "*")]))
15+
assert not vf.is_suppressed(set([("c", "d")]))

0 commit comments

Comments
 (0)