Skip to content

Add a gap detector to Sir Complains-a-lot #327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions sir_complainsalot/delphi_sir_complainsalot/check_source.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import List

import covidcast
import numpy as np
import pandas as pd

@dataclass
Expand All @@ -27,33 +29,87 @@ def to_md(self):
message=self.message, updated=self.last_updated.strftime("%Y-%m-%d"))

def check_source(data_source, meta, params, grace):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded. If there is no data available on
    2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
    days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window of
    10 days means we check the most recent 10 days of data. Defaults to 7.

    Parameters
    ----------
    data_source
        Name of the source to check; used as a key into `params` and to
        filter `meta` on its `data_source` column.
    meta
        DataFrame with (at least) the columns `data_source`, `signal`,
        `geo_type` and `max_time`, one row per signal/geo type.
    params
        Mapping from source name to that source's configuration. Each
        configuration must provide `max_age` and `maintainers`, and may
        provide `gap_window` (default 7), `max_gap` (default 1; -1 disables
        gap detection) and `retired-signals` (signals to skip entirely).
    grace
        Extra number of days added to `max_age` before complaining.

    Returns
    -------
    list
        One Complaint per signal with stale data, followed by one Complaint
        per signal with a gap; each lists every affected geo type.
    """

    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()

    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        if "retired-signals" in source_config and \
                row["signal"] in source_config["retired-signals"]:
            continue

        # Check max age
        age = (now - row["max_time"]).days

        if age > source_config["max_age"] + grace:
            if row["signal"] not in age_complaints:
                age_complaints[row["signal"]] = Complaint(
                    "is more than {age} days old".format(age=age),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                age_complaints[row["signal"]].geo_types.append(row["geo_type"])

        # Check max gap
        if max_allowed_gap == -1:
            # No gap detection for this source
            continue

        latest_data = covidcast.signal(
            data_source, row["signal"],
            start_day=row["max_time"] - gap_window,
            end_day=row["max_time"],
            geo_type=row["geo_type"]
        )

        if latest_data is None:
            # NOTE(review): the covidcast client is documented to return None
            # when no data is available -- nothing to measure gaps on, so skip
            # instead of failing on the subscript below.
            continue

        # convert numpy datetime values to pandas datetimes and then to
        # datetime.date, so we can work with timedeltas after. Sort them:
        # unique() keeps order of first appearance, which is not guaranteed
        # to be chronological when several locations are returned.
        unique_dates = sorted(pd.to_datetime(val).date()
                              for val in latest_data["time_value"].unique())

        gap_days = [(day - prev_day).days
                    for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])]
        # With fewer than two distinct dates there are no consecutive pairs;
        # treat that as "no gap" rather than letting max() raise ValueError.
        gap = max(gap_days, default=0)

        if gap > max_allowed_gap:
            if row["signal"] not in gap_complaints:
                gap_complaints[row["signal"]] = Complaint(
                    "has a {gap}-day gap of missing data in its most recent "
                    "{gap_window} days of data".format(gap=gap, gap_window=gap_window.days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                gap_complaints[row["signal"]].geo_types.append(row["geo_type"])

    return list(age_complaints.values()) + list(gap_complaints.values())
2 changes: 1 addition & 1 deletion sir_complainsalot/delphi_sir_complainsalot/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def run_module():

complaints = []
for data_source in params["sources"].keys():
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace",0)))
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace", 0)))

if len(complaints) > 0:
for complaint in complaints:
Expand Down
9 changes: 9 additions & 0 deletions sir_complainsalot/params.json.template
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
"max_age": 5,
"maintainers": ["U010VE2T51N"]
},
"hospital-admissions": {
"max_age": 5,
"maintainers": ["U010VE2T51N"],
"retired-signals": ["smoothed_covid19", "smoothed_adj_covid19"]
},
"ght": {
"max_age": 5,
"maintainers": ["U010VE2T51N"]
Expand All @@ -14,6 +19,10 @@
"max_age": 2,
"maintainers": ["UUCGWMJ5P"]
},
"usa-facts": {
"max_age": 2,
"maintainers": ["UUCGWMJ5P"]
},
"safegraph": {
"max_age": 4,
"maintainers": ["U010VE2T51N"]
Expand Down