Skip to content

Commit 8049cb0

Browse files
authored
Merge pull request #2007 from cmu-delphi/ndefries/google-symptoms-patching-fn
`google-symptoms` patching fn should not return data for the 4 days immediately before the issue date
2 parents d4782f7 + 9a767c4 commit 8049cb0

File tree

7 files changed: 112 additions (+112) and 85 deletions (-85)

7 files changed: 112 additions (+112) and 85 deletions (-85)

google_symptoms/delphi_google_symptoms/date_utils.py

Lines changed: 20 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -17,47 +17,36 @@ def generate_patch_dates(params: Dict) -> Dict[date, Tuple[date]]:
1717
1818
Parameters
1919
----------
20-
issue_date
21-
end_date
22-
params
20+
params: dictionary parsed from params.json
2321
2422
Returns
2523
-------
26-
dict of date and tuple of date
24+
dict(date: dict(export date range settings))
2725
"""
2826
issue_date = datetime.strptime(params["patch"]["start_issue"], "%Y-%m-%d")
2927
end_date = datetime.strptime(params["patch"]["end_issue"], "%Y-%m-%d")
30-
num_export_days = _generate_num_export_days(params)
28+
num_export_days = params["validation"]["common"].get("span_length", 14)
3129

3230
patch_dates = dict()
3331
while issue_date <= end_date:
34-
# negate the subtraction done within generate_query_dates
35-
expected_start_dt = issue_date - timedelta(days=num_export_days - PAD_DAYS + 1)
36-
daterange = generate_query_dates(expected_start_dt, issue_date, num_export_days, True)
37-
patch_dates[issue_date] = tuple(daterange)
38-
issue_date += timedelta(days=1)
39-
return patch_dates
32+
global_max_expected_lag = get_max_lag(params)
33+
export_end_date = issue_date - timedelta(days=global_max_expected_lag + 1)
34+
export_start_date = issue_date - timedelta(days=num_export_days + global_max_expected_lag + 1)
4035

36+
patch_dates[issue_date] = {
37+
"export_start_date": export_start_date,
38+
"export_end_date": export_end_date,
39+
"num_export_days": num_export_days,
40+
}
4141

42-
def _generate_num_export_days(params: Dict) -> int:
43-
"""
44-
Generate dates for exporting with possible lag.
42+
issue_date += timedelta(days=1)
4543

46-
Parameters
47-
----------
48-
params: dictionary parsed from params.json
44+
return patch_dates
4945

50-
Returns
51-
-------
52-
number of export days
53-
"""
54-
# Calculate number of days based on what's missing from the API and
55-
# what the validator expects.
46+
def get_max_lag(params: Dict) -> int:
47+
"""Determine reporting lag for data source"""
5648
max_expected_lag = lag_converter(params["validation"]["common"].get("max_expected_lag", {"all": 4}))
57-
global_max_expected_lag = max(list(max_expected_lag.values()))
58-
num_export_days = params["validation"]["common"].get("span_length", 14) + global_max_expected_lag
59-
return num_export_days
60-
49+
return max(list(max_expected_lag.values()))
6150

6251
def generate_num_export_days(params: Dict, logger) -> [int]:
6352
"""
@@ -85,7 +74,7 @@ def generate_num_export_days(params: Dict, logger) -> [int]:
8574
# Fetch metadata to check how recent each signal is
8675
covidcast.use_api_key(params["indicator"]["api_credentials"])
8776
metadata = covidcast.metadata()
88-
# Filter to only those we currently want to produce, ignore any old or deprecated signals
77+
# Filter to only those signals we currently want to produce for `google-symptoms`
8978
gs_metadata = metadata[(metadata.data_source == "google-symptoms") & (metadata.signal.isin(sensor_names))]
9079

9180
num_export_days = params["indicator"]["num_export_days"]
@@ -98,7 +87,8 @@ def generate_num_export_days(params: Dict, logger) -> [int]:
9887
num_export_days = (export_end_date - FULL_BKFILL_START_DATE).days + 1
9988
else:
10089
latest_date_diff = (datetime.today() - to_datetime(min(gs_metadata.max_time))).days + 1
101-
expected_date_diff = _generate_num_export_days(params)
90+
global_max_expected_lag = get_max_lag(params)
91+
expected_date_diff = params["validation"]["common"].get("span_length", 14) + global_max_expected_lag
10292

10393
if latest_date_diff > expected_date_diff:
10494
logger.info(f"Missing dates from: {to_datetime(min(gs_metadata.max_time)).date()}")
@@ -138,3 +128,4 @@ def generate_query_dates(
138128
retrieve_dates = [start_date - timedelta(days=PAD_DAYS - 1), export_end_date]
139129

140130
return retrieve_dates
131+

google_symptoms/delphi_google_symptoms/patch.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -67,17 +67,24 @@ def patch(params):
6767
patch_dates = generate_patch_dates(params)
6868

6969
while issue_date <= end_issue:
70-
daterange = patch_dates[issue_date]
7170
logger.info(f"""Running issue {issue_date.strftime("%Y-%m-%d")}""")
71+
72+
# Output dir setup
7273
current_issue_yyyymmdd = issue_date.strftime("%Y%m%d")
7374
current_issue_dir = f"""{params["patch"]["patch_dir"]}/issue_{current_issue_yyyymmdd}/google-symptom"""
7475
makedirs(f"{current_issue_dir}", exist_ok=True)
76+
7577
params["common"]["export_dir"] = f"""{current_issue_dir}"""
78+
params["indicator"]["custom_run"] = True
79+
80+
date_settings = patch_dates[issue_date]
81+
82+
params["indicator"]["export_start_date"] = date_settings["export_start_date"].strftime("%Y-%m-%d")
83+
params["indicator"]["export_end_date"] = date_settings["export_end_date"].strftime("%Y-%m-%d")
84+
params["indicator"]["num_export_days"] = date_settings["num_export_days"]
7685

77-
params["indicator"]["export_start_date"] = daterange[0].strftime("%Y-%m-%d")
78-
params["indicator"]["export_end_date"] = daterange[1].strftime("%Y-%m-%d")
79-
params["patch"]["patch_flag"] = True
8086
run_module(params, logger)
87+
8188
issue_date += timedelta(days=1)
8289

8390

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
from google.oauth2 import service_account
1010

1111
from .constants import COMBINED_METRIC, DC_FIPS, DTYPE_CONVERSIONS, METRICS, SYMPTOM_SETS
12+
from .date_utils import generate_query_dates
1213

1314
# Create map of BigQuery symptom column names to desired column names.
1415
colname_map = {"symptom_" +
@@ -214,7 +215,7 @@ def initialize_credentials(credentials):
214215
pandas_gbq.context.project = credentials.project_id
215216

216217

217-
def pull_gs_data(credentials, export_date_range):
218+
def pull_gs_data(credentials, export_start_date, export_end_date, num_export_days, custom_run_flag):
218219
"""Pull latest dataset for each geo level and combine.
219220
220221
PS: No information for PR
@@ -237,7 +238,7 @@ def pull_gs_data(credentials, export_date_range):
237238
dict: {"county": pd.DataFrame, "state": pd.DataFrame}
238239
"""
239240
# Fetch and format dates we want to attempt to retrieve
240-
241+
export_date_range = generate_query_dates(export_start_date, export_end_date, num_export_days, custom_run_flag)
241242
retrieve_dates = format_dates_for_query(export_date_range)
242243

243244
initialize_credentials(credentials)

google_symptoms/delphi_google_symptoms/run.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from delphi_utils import create_export_csv, get_structured_logger
1313

1414
from .constants import COMBINED_METRIC, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP
15-
from .date_utils import generate_num_export_days, generate_query_dates
15+
from .date_utils import generate_num_export_days
1616
from .geo import geo_map
1717
from .pull import pull_gs_data
1818

@@ -58,10 +58,9 @@ def run_module(params, logger=None):
5858
custom_run_flag = (
5959
False if not params["indicator"].get("custom_run", False) else params["indicator"].get("custom_run", False)
6060
)
61-
export_date_range = generate_query_dates(export_start_date, export_end_date, num_export_days, custom_run_flag)
6261

6362
# Pull GS data
64-
dfs = pull_gs_data(params["indicator"]["bigquery_credentials"], export_date_range)
63+
dfs = pull_gs_data(params["indicator"]["bigquery_credentials"], export_start_date, export_end_date, num_export_days, custom_run_flag)
6564
for geo_res in GEO_RESOLUTIONS:
6665
if geo_res == "state":
6766
df_pull = dfs["state"]

google_symptoms/tests/test_date_utils.py

Lines changed: 47 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,13 @@
44
from freezegun import freeze_time
55
from conftest import TEST_DIR, NEW_DATE
66

7+
import covidcast
8+
79
from delphi_utils.validator.utils import lag_converter
810
from delphi_google_symptoms.constants import FULL_BKFILL_START_DATE
911
from delphi_google_symptoms.date_utils import generate_query_dates, generate_num_export_days, generate_patch_dates
12+
13+
1014
class TestDateUtils:
1115

1216
@freeze_time("2021-01-05")
@@ -36,57 +40,77 @@ def test_generate_query_dates_custom(self):
3640
assert set(output) == set(expected)
3741

3842
def test_generate_export_dates(self, params, logger, monkeypatch):
39-
import covidcast
4043
metadata_df = pd.read_csv(f"{TEST_DIR}/test_data/covid_metadata.csv")
4144
monkeypatch.setattr(covidcast, "metadata", lambda: metadata_df)
42-
num_export_days = generate_num_export_days(params, logger)
4345

46+
num_export_days = generate_num_export_days(params, logger)
4447
expected_num_export_days = params["indicator"]["num_export_days"]
45-
4648
assert num_export_days == expected_num_export_days
4749

4850
def test_generate_export_dates_normal(self, params_w_no_date, logger, monkeypatch):
49-
import covidcast
5051
metadata_df = pd.read_csv(f"{TEST_DIR}/test_data/covid_metadata.csv")
5152
monkeypatch.setattr(covidcast, "metadata", lambda: metadata_df)
53+
5254
num_export_days = generate_num_export_days(params_w_no_date, logger)
5355

54-
max_expected_lag = lag_converter(params_w_no_date["validation"]["common"].get("max_expected_lag", {"all": 4}))
56+
max_expected_lag = lag_converter(params_w_no_date["validation"]["common"]["max_expected_lag"])
5557
global_max_expected_lag = max(list(max_expected_lag.values()))
56-
expected_num_export_days = params_w_no_date["validation"]["common"].get("span_length", 14) + global_max_expected_lag
58+
expected_num_export_days = params_w_no_date["validation"]["common"]["span_length"] + global_max_expected_lag
5759

5860
assert num_export_days == expected_num_export_days
5961

6062
def test_generate_export_date_missing(self, params_w_no_date, logger, monkeypatch):
61-
import covidcast
6263
metadata_df = pd.read_csv(f"{TEST_DIR}/test_data/covid_metadata_missing.csv")
6364
monkeypatch.setattr(covidcast, "metadata", lambda: metadata_df)
65+
6466
num_export_days = generate_num_export_days(params_w_no_date, logger)
6567
expected_num_export_days = (date.today() - FULL_BKFILL_START_DATE.date()).days + 1
6668
assert num_export_days == expected_num_export_days
6769

68-
def test_generate_patch_dates(self, params_w_patch, logger, monkeypatch):
69-
import covidcast
70-
metadata_df = pd.read_csv(f"{TEST_DIR}/test_data/covid_metadata_missing.csv")
71-
monkeypatch.setattr(covidcast, "metadata", lambda: metadata_df)
72-
max_expected_lag = lag_converter(params_w_patch["validation"]["common"].get("max_expected_lag", {"all": 4}))
70+
def generate_expected_start_end_dates(self, params_, issue_date):
71+
# Actual dates reported on issue dates June 27-29, 2024, by the old
72+
# version of the google-symptoms indicator
73+
# (https://github.com/cmu-delphi/covidcast-indicators/tree/b338a0962bf3a63f70a83f0b719516f914b098e2).
74+
# The patch module should be able to recreate these dates.
75+
dates_dict = {
76+
"2024-06-27": [ '2024-06-02', '2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22'],
77+
"2024-06-28": ['2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22', '2024-06-23'],
78+
"2024-06-29": ['2024-06-04', '2024-06-05', '2024-06-06','2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22', '2024-06-23', '2024-06-24'],
79+
}
80+
81+
dates_dict = {
82+
datetime.strptime(key, "%Y-%m-%d"): [
83+
datetime.strptime(listvalue, "%Y-%m-%d") for listvalue in value
84+
] for key, value in dates_dict.items()
85+
}
86+
87+
dates = dates_dict[issue_date]
88+
89+
# Raw signals add 6 extra dates of padding for later calculating
90+
# smoothed signals. Since this test is checking an early step in the
91+
# process, before padding has happened, we can drop the first 6
92+
# dates.
93+
return {
94+
"export_start_date": min(dates[6:21]),
95+
"export_end_date": max(dates[6:21])
96+
}
97+
98+
def test_generate_patch_dates(self, params_w_patch, logger):
99+
max_expected_lag = lag_converter(params_w_patch["validation"]["common"]["max_expected_lag"])
73100
global_max_expected_lag = max(list(max_expected_lag.values()))
74-
expected_num_export_days = params_w_patch["validation"]["common"].get("span_length", 14) + global_max_expected_lag
101+
num_export_days = params_w_patch["validation"]["common"]["span_length"]
75102

76103
issue_date = datetime.strptime(params_w_patch["patch"]["start_issue"], "%Y-%m-%d")
77104
end_issue = datetime.strptime(params_w_patch["patch"]["end_issue"], "%Y-%m-%d")
78105

79106
patch_date_dict = generate_patch_dates(params_w_patch)
80107

81108
while issue_date <= end_issue:
82-
expected_daterange = generate_query_dates(
83-
FULL_BKFILL_START_DATE,
84-
issue_date,
85-
expected_num_export_days,
86-
False
87-
)
88109
# in the patch script the date generated by generate_patch_dates becomes the export_start_date and export_end_date
89-
export_start_date, export_end_date = patch_date_dict[issue_date]
90-
actual_daterange = generate_query_dates(export_start_date, export_end_date, expected_num_export_days, True)
91-
assert set(actual_daterange) == set(expected_daterange)
92-
issue_date += timedelta(days=1)
110+
patch_settings = patch_date_dict[issue_date]
111+
expected_dict = self.generate_expected_start_end_dates(params_w_patch, issue_date)
112+
expected_dict["num_export_days"] = num_export_days # unmodified
113+
114+
assert patch_settings == expected_dict
115+
116+
issue_date += timedelta(days=1)

google_symptoms/tests/test_patch.py

Lines changed: 27 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -25,27 +25,29 @@ def parse_csv_file(self, file_list: List[str]) -> Tuple[List[datetime]]:
2525
return sorted(smoothed_list), sorted(raw_list)
2626

2727
def generate_expected_dates(self, params_, smoother, issue_date):
28-
max_expected_lag = lag_converter(params_["validation"]["common"].get("max_expected_lag", {"all": 4}))
29-
global_max_expected_lag = max(list(max_expected_lag.values()))
30-
31-
if params_["indicator"].get("num_export_days"):
32-
num_export_days = params_["indicator"]["num_export_days"]
28+
# Actual dates reported on issue dates June 27-29, 2024, by the old
29+
# version of the google-symptoms indicator
30+
# (https://github.com/cmu-delphi/covidcast-indicators/tree/b338a0962bf3a63f70a83f0b719516f914b098e2).
31+
# The patch module should be able to recreate these dates.
32+
dates_dict = {
33+
"2024-06-27": [ '2024-06-02', '2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22'],
34+
"2024-06-28": ['2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22', '2024-06-23'],
35+
"2024-06-29": ['2024-06-04', '2024-06-05', '2024-06-06','2024-06-07', '2024-06-08', '2024-06-09', '2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16', '2024-06-17', '2024-06-18', '2024-06-19', '2024-06-20', '2024-06-21', '2024-06-22', '2024-06-23', '2024-06-24'],
36+
}
37+
38+
dates_dict = {
39+
datetime.strptime(key, "%Y-%m-%d"): [
40+
datetime.strptime(listvalue, "%Y-%m-%d") for listvalue in value
41+
] for key, value in dates_dict.items()
42+
}
43+
44+
dates = dates_dict[issue_date]
45+
46+
if smoother == "raw":
47+
return dates
3348
else:
34-
num_export_days = params_["validation"]["common"].get("span_length", 14) + global_max_expected_lag
35-
36-
# mimic date generate as if the issue date was "today"
37-
query_start_date, query_end_date = generate_query_dates(
38-
FULL_BKFILL_START_DATE,
39-
issue_date,
40-
num_export_days,
41-
False
42-
)
43-
# the smoother in line 82-88 filters out prev seven days
44-
export_start_date = query_start_date + timedelta(days=6) if smoother == "smoothed" else query_start_date
45-
export_end_date = query_end_date - timedelta(days=global_max_expected_lag)
46-
num_export_days = (export_end_date - export_start_date).days + 1
47-
48-
return sorted([export_start_date + timedelta(days=x) for x in range(num_export_days)])
49+
# Smoothed signals drop the first 6 dates.
50+
return dates[6:21]
4951

5052
def mocked_patch(self, params_):
5153
with mock_patch("delphi_google_symptoms.patch.read_params", return_value=params_), \
@@ -58,9 +60,7 @@ def side_effect(*args, **kwargs):
5860
df = state_data_gap
5961
pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
6062
start_date, end_date = re.findall(pattern, args[0])
61-
end_date_w_lag = (datetime.strptime(end_date, "%Y-%m-%d") - timedelta(days=4)).strftime("%Y-%m-%d")
62-
return df[(df["date"] >= start_date) & \
63-
(df["date"] <= end_date_w_lag)]
63+
return df[(df["date"] >= start_date) & (df["date"] <= end_date)]
6464
else:
6565
return pd.DataFrame()
6666

@@ -80,11 +80,15 @@ def side_effect(*args, **kwargs):
8080

8181
assert smoothed_dates == expected_smoothed_dates
8282
assert raw_dates == expected_raw_dates
83+
8384
shutil.rmtree(issue_dir)
85+
8486
start_date += timedelta(days=1)
87+
8588
def test_patch_default(self, params_w_patch):
8689
params_w_patch["indicator"]["num_export_days"] = None
8790
self.mocked_patch(params_w_patch)
91+
8892
def test_patch_date_set(self, params_w_patch):
8993
self.mocked_patch(params_w_patch)
9094

google_symptoms/tests/test_pull.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,8 @@ def test_good_file(self, mock_credentials, mock_read_gbq):
4444
"20201230", "%Y%m%d")
4545
end_date = datetime.combine(date.today(), datetime.min.time())
4646

47-
dfs = pull_gs_data("", [start_date, end_date])
47+
dfs = pull_gs_data("", datetime.strptime(
48+
"20201230", "%Y%m%d"), datetime.combine(date.today(), datetime.min.time()), 0, False)
4849

4950
for level in ["county", "state"]:
5051
df = dfs[level]

0 commit comments

Comments
 (0)