From e7b69ebed725bac0f488b3151023cdf4f27ade35 Mon Sep 17 00:00:00 2001 From: george haff Date: Wed, 28 Jun 2023 17:25:14 -0400 Subject: [PATCH 1/5] allow http caching of metadata endpoint (untested) --- src/server/_printer.py | 18 +++-- src/server/endpoints/covidcast_meta.py | 97 ++++++++++++++------------ 2 files changed, 64 insertions(+), 51 deletions(-) diff --git a/src/server/_printer.py b/src/server/_printer.py index 6e32d7d43..8f55c563c 100644 --- a/src/server/_printer.py +++ b/src/server/_printer.py @@ -46,13 +46,14 @@ def __init__(self): self.result: int = -1 self._max_results: int = MAX_COMPATIBILITY_RESULTS if is_compatibility_mode() else MAX_RESULTS - def make_response(self, gen): + def make_response(self, gen, headers=None): return Response( gen, mimetype="application/json", + headers=headers, ) - def __call__(self, generator: Iterable[Dict[str, Any]]) -> Response: + def __call__(self, generator: Iterable[Dict[str, Any]], headers=None) -> Response: def gen(): self.result = -2 # no result, default response began = False @@ -84,7 +85,7 @@ def gen(): if r is not None: yield r - return self.make_response(stream_with_context(gen())) + return self.make_response(stream_with_context(gen()), headers=headers) @property def remaining_rows(self) -> int: @@ -223,8 +224,11 @@ def __init__(self, filename: Optional[str] = "epidata"): super(CSVPrinter, self).__init__() self._filename = filename - def make_response(self, gen): - headers = {"Content-Disposition": f"attachment; filename={self._filename}.csv"} if self._filename else {} + def make_response(self, gen, headers=None): + if headers is None: + headers = {} + if self._filename: + headers["Content-Disposition"] = f"attachment; filename={self._filename}.csv"} return Response(gen, mimetype="text/csv; charset=utf8", headers=headers) def _begin(self): @@ -296,8 +300,8 @@ class JSONLPrinter(APrinter): a printer class writing in JSONLines format """ - def make_response(self, gen): - return Response(gen, mimetype=" text/plain; charset=utf8") + def make_response(self, gen, headers=None): + return Response(gen, mimetype=" text/plain; charset=utf8", headers=headers) def _begin(self): if show_hard_api_key_warning(): diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index d10de18db..06f8decdb 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -26,57 +26,66 @@ def __str__(self): return f"{self.source}:{self.signal}" -def fetch_data( - time_types: Optional[List[str]], - geo_types: Optional[List[str]], - signals: Optional[List[SourceSignal]], -): - # complain if the cache is more than 75 minutes old - max_age = 75 * 60 - - row = db.execute( +# empty generator that never yields +def _nonerator(): + return + yield + + +@bp.route("/", methods=("GET", "POST")) +def handle(): + time_types = extract_strings("time_types") + signals = [SourceSignal(v) for v in (extract_strings("signals") or [])] + geo_types = extract_strings("geo_types") + + printer = create_printer(request.values.get("format")) + + metadata = db.execute( text( "SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1" ) ).fetchone() - if not row or not row["epidata"]: + if not metadata or "epidata" not in metadata: + # the db table `covidcast_meta_cache` has no rows get_structured_logger('server_api').warning("no data in covidcast_meta cache") - return - - age = row["age"] - if age > max_age and row["epidata"]: - get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age) - - epidata = loads(row["epidata"]) - - if not epidata: - return + return printer(_nonerator()) - def filter_row(row: Dict): - if time_types and row.get("time_type") not in time_types: - return False - if geo_types and row.get("geo_type") not in geo_types: - return False - if not signals: - return True - for signal in signals: - # match source and (signal or no signal or signal = *) - if row.get("data_source") == signal.source and ( - signal.signal == "*" or signal.signal == row.get("signal") - ): - return True - return False + metadata_list = loads(metadata["epidata"]) - for row in epidata: - if filter_row(row): - yield row + if not metadata_list: + # the db table has a row, but there is no metadata about any signals in it + get_structured_logger('server_api').warning("empty entry in covidcast_meta cache") + return printer(_nonerator()) + standard_age = 60 * 60 # expected metadata regeneration interval, in seconds (currently 60 mins) + # TODO: get this ^ from a config var? ideally, it should be set to the time between runs of + # src/acquisition/covidcast/covidcast_meta_cache_updater.py + age = metadata["age"] + if age > standard_age * 1.25: + # complain if the cache is too old (currently, more than 75 mins old) + get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age) -@bp.route("/", methods=("GET", "POST")) -def handle(): - time_types = extract_strings("time_types") - signals = [SourceSignal(v) for v in (extract_strings("signals") or [])] - geo_types = extract_strings("geo_types") - - return create_printer(request.values.get("format"))(filter_fields(fetch_data(time_types, geo_types, signals))) + def cache_entry_gen(): + for entry in metadata_list: + if time_types and row.get("time_type") not in time_types: + continue + if geo_types and row.get("geo_type") not in geo_types: + continue + if not signals: + yield entry + for signal in signals: + # match source and (signal or no signal or signal = *) + if row.get("data_source") == signal.source and ( + signal.signal == "*" or signal.signal == row.get("signal") + ): + yield entry + + return printer( + filter_fields(cache_entry_gen()), + headers={ + "Cache-Control": f"max-age={standard_age}, public", + "Age": f"{age}", + # TODO?: "Expires": f"{}", # superseded by Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires + } + ) From 4ce0ff5f2f290ee2b65500fdea8a946aa74c77a2 Mon Sep 17 00:00:00 2001 From: george haff Date: Wed, 28 Jun 2023 17:33:50 -0400 Subject: [PATCH 2/5] remove unmatched curly paren --- src/server/_printer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/_printer.py b/src/server/_printer.py index 8f55c563c..336b650cc 100644 --- a/src/server/_printer.py +++ b/src/server/_printer.py @@ -228,7 +228,7 @@ def make_response(self, gen, headers=None): if headers is None: headers = {} if self._filename: - headers["Content-Disposition"] = f"attachment; filename={self._filename}.csv"} + headers["Content-Disposition"] = f"attachment; filename={self._filename}.csv" return Response(gen, mimetype="text/csv; charset=utf8", headers=headers) def _begin(self): From 9e83107fe51ae52ba57d1663c62d660b5c4f11c8 Mon Sep 17 00:00:00 2001 From: george haff Date: Wed, 28 Jun 2023 17:44:12 -0400 Subject: [PATCH 3/5] rename variables but forreals this time --- src/server/endpoints/covidcast_meta.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index 06f8decdb..fd0491314 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -68,16 +68,16 @@ def handle(): def cache_entry_gen(): for entry in metadata_list: - if time_types and row.get("time_type") not in time_types: + if time_types and entry.get("time_type") not in time_types: continue - if geo_types and row.get("geo_type") not in geo_types: + if geo_types and entry.get("geo_type") not in geo_types: continue if not signals: yield entry for signal in signals: # match source and (signal or no signal or signal = *) - if row.get("data_source") == signal.source and ( - signal.signal == "*" or signal.signal == row.get("signal") + if entry.get("data_source") == signal.source and ( + signal.signal == "*" or signal.signal == entry.get("signal") ): yield entry From d6a56cf6eda3deadacb35cd2e593abf173dacb80 Mon Sep 17 00:00:00 2001 From: george haff Date: Thu, 6 Jul 2023 19:57:02 -0400 Subject: [PATCH 4/5] added fudge factor, updated comments, removed staleness logging --- src/server/endpoints/covidcast_meta.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index fd0491314..a8c9e56e9 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -58,13 +58,19 @@ def handle(): get_structured_logger('server_api').warning("empty entry in covidcast_meta cache") return printer(_nonerator()) - standard_age = 60 * 60 # expected metadata regeneration interval, in seconds (currently 60 mins) - # TODO: get this ^ from a config var? ideally, it should be set to the time between runs of - # src/acquisition/covidcast/covidcast_meta_cache_updater.py + # the expected metadata regeneration interval in seconds, aka time between runs of + # src/acquisition/covidcast/covidcast_meta_cache_updater.py (currently 2h) + standard_age = 2 * 60 * 60 + # a short period when a client can continue to use this metadata even if its slightly stale, + # which also gives some padding if the md generation is running slow, + # and which also acts as a minimum cacheable time (currently 10 mins) + age_margin = 10 * 60 + # these should be updated if a stale cache will have undue impact on user activities, such as + # if we start updating the metadata table much more frequently and having up-to-the-minute + # metadata accuracy becomes important to users once more. + # TODO: get the above two values ^ from config vars? age = metadata["age"] - if age > standard_age * 1.25: - # complain if the cache is too old (currently, more than 75 mins old) - get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age) + reported_age = max(0, min(age, standard_age) - age_margin) def cache_entry_gen(): for entry in metadata_list: @@ -85,7 +91,7 @@ def cache_entry_gen(): filter_fields(cache_entry_gen()), headers={ "Cache-Control": f"max-age={standard_age}, public", - "Age": f"{age}", + "Age": f"{reported_age}", # TODO?: "Expires": f"{}", # superseded by Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires } ) From 082cc757731d14c59be19c9562211452b2e57a90 Mon Sep 17 00:00:00 2001 From: george haff Date: Mon, 10 Jul 2023 15:30:03 -0400 Subject: [PATCH 5/5] appease sonarlint even though its somewhat mistaken --- src/server/endpoints/covidcast_meta.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index a8c9e56e9..52d0a06eb 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -28,8 +28,7 @@ def __str__(self): # empty generator that never yields def _nonerator(): - return - yield + return (x for x in []) @bp.route("/", methods=("GET", "POST"))