Skip to content

Commit 59385b9

Browse files
authored
http caching for metadata (#1222)
Allow HTTP caching of the metadata endpoint (plus a small margin), and remove staleness logging.
1 parent f7a279b commit 59385b9

File tree

2 files changed

+71
-53
lines changed

2 files changed

+71
-53
lines changed

src/server/_printer.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,14 @@ def __init__(self):
4646
self.result: int = -1
4747
self._max_results: int = MAX_COMPATIBILITY_RESULTS if is_compatibility_mode() else MAX_RESULTS
4848

49-
def make_response(self, gen):
49+
def make_response(self, gen, headers=None):
5050
return Response(
5151
gen,
5252
mimetype="application/json",
53+
headers=headers,
5354
)
5455

55-
def __call__(self, generator: Iterable[Dict[str, Any]]) -> Response:
56+
def __call__(self, generator: Iterable[Dict[str, Any]], headers=None) -> Response:
5657
def gen():
5758
self.result = -2 # no result, default response
5859
began = False
@@ -84,7 +85,7 @@ def gen():
8485
if r is not None:
8586
yield r
8687

87-
return self.make_response(stream_with_context(gen()))
88+
return self.make_response(stream_with_context(gen()), headers=headers)
8889

8990
@property
9091
def remaining_rows(self) -> int:
@@ -223,8 +224,11 @@ def __init__(self, filename: Optional[str] = "epidata"):
223224
super(CSVPrinter, self).__init__()
224225
self._filename = filename
225226

226-
def make_response(self, gen):
227-
headers = {"Content-Disposition": f"attachment; filename={self._filename}.csv"} if self._filename else {}
227+
def make_response(self, gen, headers=None):
228+
if headers is None:
229+
headers = {}
230+
if self._filename:
231+
headers["Content-Disposition"] = f"attachment; filename={self._filename}.csv"
228232
return Response(gen, mimetype="text/csv; charset=utf8", headers=headers)
229233

230234
def _begin(self):
@@ -296,8 +300,8 @@ class JSONLPrinter(APrinter):
296300
a printer class writing in JSONLines format
297301
"""
298302

299-
def make_response(self, gen):
300-
return Response(gen, mimetype=" text/plain; charset=utf8")
303+
def make_response(self, gen, headers=None):
304+
return Response(gen, mimetype=" text/plain; charset=utf8", headers=headers)
301305

302306
def _begin(self):
303307
if show_hard_api_key_warning():

src/server/endpoints/covidcast_meta.py

+60-46
Original file line numberDiff line numberDiff line change
@@ -26,51 +26,9 @@ def __str__(self):
2626
return f"{self.source}:{self.signal}"
2727

2828

29-
def fetch_data(
30-
time_types: Optional[List[str]],
31-
geo_types: Optional[List[str]],
32-
signals: Optional[List[SourceSignal]],
33-
):
34-
# complain if the cache is more than 75 minutes old
35-
max_age = 75 * 60
36-
37-
row = db.execute(
38-
text(
39-
"SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1"
40-
)
41-
).fetchone()
42-
43-
if not row or not row["epidata"]:
44-
get_structured_logger('server_api').warning("no data in covidcast_meta cache")
45-
return
46-
47-
age = row["age"]
48-
if age > max_age and row["epidata"]:
49-
get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age)
50-
51-
epidata = loads(row["epidata"])
52-
53-
if not epidata:
54-
return
55-
56-
def filter_row(row: Dict):
57-
if time_types and row.get("time_type") not in time_types:
58-
return False
59-
if geo_types and row.get("geo_type") not in geo_types:
60-
return False
61-
if not signals:
62-
return True
63-
for signal in signals:
64-
# match source and (signal or no signal or signal = *)
65-
if row.get("data_source") == signal.source and (
66-
signal.signal == "*" or signal.signal == row.get("signal")
67-
):
68-
return True
69-
return False
70-
71-
for row in epidata:
72-
if filter_row(row):
73-
yield row
29+
# empty generator that never yields
30+
def _nonerator():
31+
return (x for x in [])
7432

7533

7634
@bp.route("/", methods=("GET", "POST"))
@@ -79,4 +37,60 @@ def handle():
7937
signals = [SourceSignal(v) for v in (extract_strings("signals") or [])]
8038
geo_types = extract_strings("geo_types")
8139

82-
return create_printer(request.values.get("format"))(filter_fields(fetch_data(time_types, geo_types, signals)))
40+
printer = create_printer(request.values.get("format"))
41+
42+
metadata = db.execute(
43+
text(
44+
"SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1"
45+
)
46+
).fetchone()
47+
48+
if not metadata or "epidata" not in metadata:
49+
# the db table `covidcast_meta_cache` has no rows
50+
get_structured_logger('server_api').warning("no data in covidcast_meta cache")
51+
return printer(_nonerator())
52+
53+
metadata_list = loads(metadata["epidata"])
54+
55+
if not metadata_list:
56+
# the db table has a row, but there is no metadata about any signals in it
57+
get_structured_logger('server_api').warning("empty entry in covidcast_meta cache")
58+
return printer(_nonerator())
59+
60+
# the expected metadata regeneration interval in seconds, aka time between runs of
61+
# src/acquisition/covidcast/covidcast_meta_cache_updater.py (currently 2h)
62+
standard_age = 2 * 60 * 60
63+
# a short period when a client can continue to use this metadata even if it's slightly stale,
64+
# which also gives some padding if the md generation is running slow,
65+
# and which also acts as a minimum cacheable time (currently 10 mins)
66+
age_margin = 10 * 60
67+
# these should be updated if a stale cache will have undue impact on user activities, such as
68+
# if we start updating the metadata table much more frequently and having up-to-the-minute
69+
# metadata accuracy becomes important to users once more.
70+
# TODO: get the above two values ^ from config vars?
71+
age = metadata["age"]
72+
reported_age = max(0, min(age, standard_age) - age_margin)
73+
74+
def cache_entry_gen():
    """Yield the metadata entries that pass the requested filters.

    Reads `metadata_list`, `time_types`, `geo_types` and `signals` from the
    enclosing scope.

    An entry is skipped when `time_types` is non-empty and does not contain
    the entry's "time_type", or when `geo_types` is non-empty and does not
    contain the entry's "geo_type".  Of the remaining entries:

    * if `signals` is empty, every entry is yielded;
    * otherwise an entry is yielded when at least one requested signal has
      the same source as the entry's "data_source" and either names the
      entry's "signal" or is the wildcard "*".

    Each entry is yielded at most once, even when several requested signals
    match it (e.g. both "src:*" and "src:sig").
    """
    for entry in metadata_list:
        if time_types and entry.get("time_type") not in time_types:
            continue
        if geo_types and entry.get("geo_type") not in geo_types:
            continue
        # `any` stops at the first matching signal, so an entry matched by
        # several overlapping signal filters is still emitted only once.
        if not signals or any(
            # match source and (signal or signal = *)
            entry.get("data_source") == signal.source
            and (signal.signal == "*" or signal.signal == entry.get("signal"))
            for signal in signals
        ):
            yield entry
88+
89+
return printer(
90+
filter_fields(cache_entry_gen()),
91+
headers={
92+
"Cache-Control": f"max-age={standard_age}, public",
93+
"Age": f"{reported_age}",
94+
# TODO?: "Expires": f"{}", # superseded by Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires
95+
}
96+
)

0 commit comments

Comments
 (0)