Skip to content

http caching for metadata #1222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions src/server/_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,14 @@ def __init__(self):
self.result: int = -1
self._max_results: int = MAX_COMPATIBILITY_RESULTS if is_compatibility_mode() else MAX_RESULTS

def make_response(self, gen):
def make_response(self, gen, headers=None):
return Response(
gen,
mimetype="application/json",
headers=headers,
)

def __call__(self, generator: Iterable[Dict[str, Any]]) -> Response:
def __call__(self, generator: Iterable[Dict[str, Any]], headers=None) -> Response:
def gen():
self.result = -2 # no result, default response
began = False
Expand Down Expand Up @@ -84,7 +85,7 @@ def gen():
if r is not None:
yield r

return self.make_response(stream_with_context(gen()))
return self.make_response(stream_with_context(gen()), headers=headers)

@property
def remaining_rows(self) -> int:
Expand Down Expand Up @@ -223,8 +224,11 @@ def __init__(self, filename: Optional[str] = "epidata"):
super(CSVPrinter, self).__init__()
self._filename = filename

def make_response(self, gen):
headers = {"Content-Disposition": f"attachment; filename={self._filename}.csv"} if self._filename else {}
def make_response(self, gen, headers=None):
if headers is None:
headers = {}
if self._filename:
headers["Content-Disposition"] = f"attachment; filename={self._filename}.csv"
return Response(gen, mimetype="text/csv; charset=utf8", headers=headers)

def _begin(self):
Expand Down Expand Up @@ -296,8 +300,8 @@ class JSONLPrinter(APrinter):
a printer class writing in JSONLines format
"""

def make_response(self, gen):
return Response(gen, mimetype=" text/plain; charset=utf8")
def make_response(self, gen, headers=None):
return Response(gen, mimetype=" text/plain; charset=utf8", headers=headers)

def _begin(self):
if show_hard_api_key_warning():
Expand Down
106 changes: 60 additions & 46 deletions src/server/endpoints/covidcast_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,51 +26,9 @@ def __str__(self):
return f"{self.source}:{self.signal}"


def fetch_data(
time_types: Optional[List[str]],
geo_types: Optional[List[str]],
signals: Optional[List[SourceSignal]],
):
# complain if the cache is more than 75 minutes old
max_age = 75 * 60

row = db.execute(
text(
"SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1"
)
).fetchone()

if not row or not row["epidata"]:
get_structured_logger('server_api').warning("no data in covidcast_meta cache")
return

age = row["age"]
if age > max_age and row["epidata"]:
get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age)

epidata = loads(row["epidata"])

if not epidata:
return

def filter_row(row: Dict):
if time_types and row.get("time_type") not in time_types:
return False
if geo_types and row.get("geo_type") not in geo_types:
return False
if not signals:
return True
for signal in signals:
# match source and (signal or no signal or signal = *)
if row.get("data_source") == signal.source and (
signal.signal == "*" or signal.signal == row.get("signal")
):
return True
return False

for row in epidata:
if filter_row(row):
yield row
# empty generator that never yields
def _nonerator():
return (x for x in [])


@bp.route("/", methods=("GET", "POST"))
Expand All @@ -79,4 +37,60 @@ def handle():
signals = [SourceSignal(v) for v in (extract_strings("signals") or [])]
geo_types = extract_strings("geo_types")

return create_printer(request.values.get("format"))(filter_fields(fetch_data(time_types, geo_types, signals)))
printer = create_printer(request.values.get("format"))

metadata = db.execute(
text(
"SELECT UNIX_TIMESTAMP(NOW()) - timestamp AS age, epidata FROM covidcast_meta_cache LIMIT 1"
)
).fetchone()

if not metadata or "epidata" not in metadata:
# the db table `covidcast_meta_cache` has no rows
get_structured_logger('server_api').warning("no data in covidcast_meta cache")
return printer(_nonerator())

metadata_list = loads(metadata["epidata"])

if not metadata_list:
# the db table has a row, but there is no metadata about any signals in it
get_structured_logger('server_api').warning("empty entry in covidcast_meta cache")
return printer(_nonerator())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional: be nice to robot

if we wanted to not make sonarLint nervous we could replace this with

Suggested change
return printer(_nonerator())
return printer(x for x in [])

via experimentation:

>>> def foo():
...     return
...     yield
... 
>>> type(foo())
<class 'generator'>
>>> type(x for x in [])
<class 'generator'>
>>> list(foo())
[]
>>> list(x for x in [])
[]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alternately (since _nonerator is used on both L52 and L59) we could refactor to pull out the printer instead of just the generator:

def _null_printer():
    return printer(x for x in [])

...
    if not metadata_list:
        # the db table has a row, but there is no metadata about any signals in it
        get_structured_logger('server_api').warning("empty entry in covidcast_meta cache")
        return _null_printer()

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe im missing something, but i cant find where sonar flagged this...?

to convey intent, i prefer a named item over a weird empty comprehension just hanging out there. to that end, the _null_printer() could work. we might even want to put it into the printer module.

Copy link
Contributor

@krivard krivard Jul 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added an exception so it would build. the condition it fails is "no unreachable code" and the yield is unreachable.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as i said in another comment, its reachable enough for python to consider this a generator... but regardless, i changed it to the alternative empty generator.


# the expected metadata regeneration interval in seconds, aka time between runs of
# src/acquisition/covidcast/covidcast_meta_cache_updater.py (currently 2h)
standard_age = 2 * 60 * 60
# a short period when a client can continue to use this metadata even if its slightly stale,
# which also gives some padding if the md generation is running slow,
# and which also acts as a minimum cacheable time (currently 10 mins)
age_margin = 10 * 60
# these should be updated if a stale cache will have undue impact on user activities, such as
# if we start updating the metadata table much more frequently and having up-to-the-minute
# metadata accuracy becomes important to users once more.
# TODO: get the above two values ^ from config vars?
age = metadata["age"]
reported_age = max(0, min(age, standard_age) - age_margin)

def cache_entry_gen():
for entry in metadata_list:
if time_types and entry.get("time_type") not in time_types:
continue
if geo_types and entry.get("geo_type") not in geo_types:
continue
if not signals:
yield entry
for signal in signals:
# match source and (signal or no signal or signal = *)
if entry.get("data_source") == signal.source and (
signal.signal == "*" or signal.signal == entry.get("signal")
):
yield entry

return printer(
filter_fields(cache_entry_gen()),
headers={
"Cache-Control": f"max-age={standard_age}, public",
"Age": f"{reported_age}",
# TODO?: "Expires": f"{}", # superseded by Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expires
}
)