Skip to content

Commit 8091221

Browse files
committed
Add a gap detector to Sir Complainsalot
Detects gaps in reported data, e.g. if just one day is missing but more recent days are available.
1 parent cf6f4c1 commit 8091221

File tree

1 file changed

+62
-6
lines changed

1 file changed

+62
-6
lines changed

sir_complainsalot/delphi_sir_complainsalot/check_source.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from dataclasses import dataclass
22
from typing import List
33

4+
import covidcast
5+
import numpy as np
46
import pandas as pd
57

68
@dataclass
@@ -27,33 +29,87 @@ def to_md(self):
2729
message=self.message, updated=self.last_updated.strftime("%Y-%m-%d"))
2830

2931
def check_source(data_source, meta, params, grace):
30-
"""Iterate over all signals from a source and check if they exceed max age."""
32+
"""Iterate over all signals from a source and check for problems.
33+
34+
Possible problems:
35+
36+
- Newest available data exceeds max age.
37+
- Gap between subsequent data points exceeds max gap.
38+
39+
For example, consider a source with a max age of 5 days and max gap of 1
40+
day. If today is 2020-10-15, and the latest available data is from
41+
2020-10-09, the max age is exceeded. If there is no data available on
42+
2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
43+
days and the max gap is exceeded.
44+
45+
The gap window controls how much data we check for gaps -- a gap window of
46+
10 days means we check the most recent 10 days of data. Defaults to 7.
47+
48+
"""
3149

3250
source_config = params[data_source]
51+
gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
52+
max_allowed_gap = source_config.get("max_gap", 1)
3353

3454
signals = meta[meta.data_source == data_source]
3555

3656
now = pd.Timestamp.now()
3757

38-
complaints = {}
58+
age_complaints = {}
59+
gap_complaints = {}
3960

4061
for _, row in signals.iterrows():
4162
if "retired-signals" in source_config and \
4263
row["signal"] in source_config["retired-signals"]:
4364
continue
4465

66+
# Check max age
4567
age = (now - row["max_time"]).days
4668

4769
if age > source_config["max_age"] + grace:
48-
if row["signal"] not in complaints:
49-
complaints[row["signal"]] = Complaint(
70+
if row["signal"] not in age_complaints:
71+
age_complaints[row["signal"]] = Complaint(
5072
"is more than {age} days old".format(age=age),
5173
data_source,
5274
row["signal"],
5375
[row["geo_type"]],
5476
row["max_time"],
5577
source_config["maintainers"])
5678
else:
57-
complaints[row["signal"]].geo_types.append(row["geo_type"])
79+
age_complaints[row["signal"]].geo_types.append(row["geo_type"])
80+
81+
# Check max gap
82+
if max_allowed_gap == -1:
83+
# No gap detection for this source
84+
continue
85+
86+
latest_data = covidcast.signal(
87+
data_source, row["signal"],
88+
start_day=row["max_time"] - gap_window,
89+
end_day=row["max_time"],
90+
geo_type=row["geo_type"]
91+
)
92+
93+
# convert numpy datetime values to pandas datetimes and then to
94+
# datetime.date, so we can work with timedeltas after
95+
unique_dates = [pd.to_datetime(val).date()
96+
for val in latest_data["time_value"].unique()]
97+
98+
gap_days = [(day - prev_day).days
99+
for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])]
100+
gap = max(gap_days)
101+
102+
if gap > max_allowed_gap:
103+
if row["signal"] not in gap_complaints:
104+
gap_complaints[row["signal"]] = Complaint(
105+
"has a {gap}-day gap of missing data in its most recent "
106+
"{gap_window} days of data".format(gap=gap, gap_window=gap_window.days),
107+
data_source,
108+
row["signal"],
109+
[row["geo_type"]],
110+
row["max_time"],
111+
source_config["maintainers"])
112+
else:
113+
gap_complaints[row["signal"]].geo_types.append(row["geo_type"])
58114

59-
return list(complaints.values())
115+
return list(age_complaints.values()) + list(gap_complaints.values())

0 commit comments

Comments
 (0)