Skip to content

Commit 54970d9

Browse files
committed
test: tweak covidcast_port tests a bit
1 parent e5c3b46 commit 54970d9

File tree

3 files changed

+55
-41
lines changed

3 files changed

+55
-41
lines changed

testing_utils/delphi_utils/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
covidcast_result

testing_utils/delphi_utils/check_covidcast_port.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,30 @@
11
"""
22
Script to check the conversion of covidcast API calls to Epidata.covidcast and Epidata.covidcast_meta.
33
"""
4-
import time
4+
55
from collections import defaultdict
66
from pathlib import Path
77
from typing import Union, Iterable, Tuple, List, Dict
88
from datetime import datetime, timedelta, date
99

10-
import numpy as np
1110
import pandas as pd
1211
import covidcast
12+
import tqdm
1313
from delphi_epidata import Epidata
1414
from pandas.testing import assert_frame_equal
1515
import os
1616
from epiweeks import Week
1717

18-
API_KEY = os.environ.get('DELPHI_API_KEY')
18+
API_KEY = os.environ.get("DELPHI_API_KEY", os.environ.get("DELPHI_EPIDATA_KEY"))
1919
covidcast.use_api_key(API_KEY)
20-
21-
Epidata.debug = True
2220
Epidata.auth = ("epidata", API_KEY)
23-
2421
CURRENT_DIR = Path(__file__).parent
22+
if not Path(f"{CURRENT_DIR}/covidcast_result").is_dir():
23+
os.mkdir(f"{CURRENT_DIR}/covidcast_result")
24+
# We will test the first X signals for each data source that we find from the
25+
# metadata endpoint with this variable.
26+
NUMBER_SIGNALS_PER_SOURCE = 5
27+
2528

2629
def _parse_datetimes(date_int: int, time_type: str, date_format: str = "%Y%m%d") -> Union[pd.Timestamp, None]:
2730
"""Convert a date or epiweeks string into timestamp objects.
@@ -151,22 +154,23 @@ def ported_signal(
151154

152155
return api_df
153156

154-
def check_metadata():
155157

158+
def check_metadata():
156159
expected_df = covidcast.metadata()
157160
df = ported_metadata()
158161
assert_frame_equal(expected_df, df)
159162

163+
160164
def ported_signal(
161-
data_source: str,
162-
signal: str, # pylint: disable=W0621
163-
start_day: date = None,
164-
end_day: date = None,
165-
geo_type: str = "county",
166-
geo_values: Union[str, Iterable[str]] = "*",
167-
as_of: date = None,
168-
lag: int = None,
169-
time_type: str = "day",
165+
data_source: str,
166+
signal: str, # pylint: disable=W0621
167+
start_day: date = None,
168+
end_day: date = None,
169+
geo_type: str = "county",
170+
geo_values: Union[str, Iterable[str]] = "*",
171+
as_of: date = None,
172+
lag: int = None,
173+
time_type: str = "day",
170174
) -> Union[pd.DataFrame, None]:
171175
"""
172176
Makes covidcast signal api call.
@@ -252,13 +256,16 @@ def generate_start_date_per_signal() -> Dict[Tuple[datetime, datetime, str], Lis
252256
Dict[Tuple[datetime.datetime, datetime.datetime, str], List[Tuple[str, str]]]
253257
"""
254258
meta_df = pd.DataFrame.from_dict(Epidata.covidcast_meta()["epidata"])
255-
meta_df["min_time"] = meta_df["min_time"].astype('str')
259+
meta_df["min_time"] = meta_df["min_time"].astype("str")
260+
meta_df = meta_df.groupby("data_source").head(NUMBER_SIGNALS_PER_SOURCE)
256261
signal_timeframe_dict = defaultdict(list)
257262

258263
for start_str, data in meta_df.groupby("min_time"):
259-
260264
data_source_groups = data.groupby("data_source")
261265
for data_source, df in data_source_groups:
266+
# TODO: Remove after metadata bug is fixed.
267+
if data_source == "google-symptom":
268+
continue
262269
signals = list(df["signal"].unique())
263270
time_type = df["time_type"].values[0]
264271
for signal in signals:
@@ -274,8 +281,7 @@ def generate_start_date_per_signal() -> Dict[Tuple[datetime, datetime, str], Lis
274281
elif time_type == "week":
275282
start_time = Week(year=int(start_str[:4]), week=int(start_str[-2:]))
276283
end_time = (start_time + 2).startdate()
277-
date_range = (start_time.startdate(),
278-
end_time, time_type)
284+
date_range = (start_time.startdate(), end_time, time_type)
279285
signal_timeframe_dict[date_range].append((data_source, signal))
280286

281287
return signal_timeframe_dict
@@ -289,39 +295,51 @@ def check_signal():
289295
"""
290296
signal_timeframe_dict = generate_start_date_per_signal()
291297
signal_df_dict = dict()
292-
for date_range, data_source_signal_list in signal_timeframe_dict.items():
298+
for date_range, data_source_signal_list in tqdm.tqdm(signal_timeframe_dict.items()):
293299
for data_source, signal in data_source_signal_list:
294300
time_type = date_range[2]
295301
filename = f"{CURRENT_DIR}/covidcast_result/{data_source}_{signal}.parquet"
296302
if not Path(filename).is_file():
297303
# every data source except google-symptoms is queried with a geo type of state
298304
geo_type = "state"
299-
if data_source == "google-symptom":
305+
if data_source == "google-symptoms":
300306
geo_type = "county"
301-
302-
expected_df = covidcast.signal(data_source, signal, start_day=date_range[0], end_day=date_range[1],
303-
geo_type=geo_type, time_type=time_type)
304-
if expected_df is None:
305-
raise RuntimeError("Data should exists")
307+
expected_df = covidcast.signal(
308+
data_source,
309+
signal,
310+
start_day=date_range[0],
311+
end_day=date_range[1],
312+
geo_type=geo_type,
313+
time_type=time_type,
314+
)
315+
assert not expected_df.empty, "Received no data from covidcast API."
306316

307317
expected_df.to_parquet(filename)
308318
signal_df_dict[(data_source, signal, time_type)] = filename
309319

310-
for date_range, data_source_signal_list in signal_timeframe_dict.items():
320+
for date_range, data_source_signal_list in tqdm.tqdm(signal_timeframe_dict.items()):
311321
for data_source, signal in data_source_signal_list:
312322
expected_filename = signal_df_dict.get((data_source, signal, date_range[2]))
313323
expected_df = pd.read_parquet(expected_filename)
314324

315325
# every data source except google-symptoms is queried with a geo type of state
316326
geo_type = "state"
317-
if data_source == "google-symptom":
327+
if data_source == "google-symptoms":
318328
geo_type = "county"
319-
df = ported_signal(data_source, signal, start_day=date_range[0], end_day=date_range[1],
320-
time_type=date_range[2],
321-
geo_type=geo_type)
329+
df = ported_signal(
330+
data_source,
331+
signal,
332+
start_day=date_range[0],
333+
end_day=date_range[1],
334+
time_type=date_range[2],
335+
geo_type=geo_type,
336+
)
337+
assert not df.empty, "Received no data from covidcast API."
338+
322339
check = df.merge(expected_df, indicator=True)
323340
assert (check["_merge"] == "both").all()
324341

342+
325343
if __name__ == "__main__":
326344
check_metadata()
327-
check_signal()
345+
check_signal()
Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
boto3
21
covidcast
32
delphi-epidata
4-
scs<3.2.6 # TODO: remove this ; it is a cvxpy dependency and the excluded version appears to break our jenkins build. see: https://github.com/cvxgrp/scs/issues/283
53
epiweeks
6-
importlib_resources>=1.3
7-
moto~=4.2.14
84
numpy
9-
pandas>=1.1.0
10-
pylint==2.8.3
11-
structlog
12-
xlrd
5+
pandas==1.5.3
6+
pyarrow
7+
tqdm

0 commit comments

Comments
 (0)