| 1 | +"""utility functions for date parsing.""" |

from datetime import date, datetime, timedelta
from itertools import product
from typing import Dict, List, Optional, Union

import covidcast
from delphi_utils.validator.utils import lag_converter
from pandas import to_datetime

from .constants import COMBINED_METRIC, FULL_BKFILL_START_DATE, PAD_DAYS, SMOOTHERS


def generate_patch_dates(params: Dict) -> Dict[date, Dict[str, Union[date, int]]]:
    """
    Generate date range for chunking backfilled dates.

    Parameters
    ----------
    params: dictionary parsed from params.json

    Returns
    -------
    dict(date: dict(export date range settings))
    """
    issue_date = datetime.strptime(params["patch"]["start_issue"], "%Y-%m-%d")
    end_date = datetime.strptime(params["patch"]["end_issue"], "%Y-%m-%d")
    num_export_days = params["validation"]["common"].get("span_length", 14)
    # The maximum expected lag is the same for every issue date, so compute it once.
    global_max_expected_lag = get_max_lag(params)

    patch_dates = dict()
    while issue_date <= end_date:
        export_end_date = issue_date - timedelta(days=global_max_expected_lag + 1)
        export_start_date = issue_date - timedelta(days=num_export_days + global_max_expected_lag + 1)

        patch_dates[issue_date] = {
            "export_start_date": export_start_date,
            "export_end_date": export_end_date,
            "num_export_days": num_export_days,
        }

        issue_date += timedelta(days=1)

    return patch_dates


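# A minimal usage sketch for generate_patch_dates. The params layout below is an
# illustrative assumption (only the keys this function reads), not a complete
# production params.json:
#
#     params = {
#         "patch": {"start_issue": "2024-01-01", "end_issue": "2024-01-03"},
#         "validation": {"common": {"span_length": 14, "max_expected_lag": {"all": 4}}},
#     }
#     generate_patch_dates(params)
#     # -> {datetime(2024, 1, 1): {"export_start_date": datetime(2023, 12, 13, 0, 0),
#     #                            "export_end_date": datetime(2023, 12, 27, 0, 0),
#     #                            "num_export_days": 14}, ...}

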
def get_max_lag(params: Dict) -> int:
    """Determine the maximum expected reporting lag for the data source."""
    max_expected_lag = lag_converter(params["validation"]["common"].get("max_expected_lag", {"all": 4}))
    return max(max_expected_lag.values())


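# A minimal sketch of get_max_lag, assuming lag_converter normalizes the
# geo-keyed max_expected_lag setting into a dict of integer day counts (an
# assumption about delphi_utils behavior):
#
#     params = {"validation": {"common": {"max_expected_lag": {"all": 4}}}}
#     get_max_lag(params)  # -> 4
#
# When max_expected_lag is absent, the default {"all": 4} is used.

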
def generate_num_export_days(params: Dict, logger) -> Optional[int]:
    """
    Generate the number of days of data to export based on currently available data.

    Parameters
    ----------
    params: dictionary parsed from params.json

    Returns
    -------
    num_export_days: int or None
    """
    # If export_end_date is not specified, use the current date.
    export_end_date = datetime.strptime(
        params["indicator"].get("export_end_date", date.today().strftime("%Y-%m-%d")), "%Y-%m-%d"
    )

    # Generate the set of signals we expect to produce.
    sensor_names = set(
        "_".join([metric, smoother, "search"]) for metric, smoother in product(COMBINED_METRIC, SMOOTHERS)
    )

    # Fetch metadata to check how recent each signal is.
    covidcast.use_api_key(params["indicator"]["api_credentials"])
    metadata = covidcast.metadata()
    # Filter to only those signals we currently want to produce for `google-symptoms`.
    gs_metadata = metadata[(metadata.data_source == "google-symptoms") & (metadata.signal.isin(sensor_names))]

    num_export_days = params["indicator"]["num_export_days"]
    custom_run = bool(params["common"].get("custom_run", False))

    if num_export_days is None and not custom_run:
        if sensor_names.difference(set(gs_metadata.signal)):
            # If any signal is not in the metadata yet, backfill its full history.
            logger.warning("Signals missing in the epidata; backfilling full history")
            num_export_days = (export_end_date - FULL_BKFILL_START_DATE).days + 1
        else:
            latest_date_diff = (datetime.today() - to_datetime(min(gs_metadata.max_time))).days + 1
            global_max_expected_lag = get_max_lag(params)
            expected_date_diff = params["validation"]["common"].get("span_length", 14) + global_max_expected_lag

            if latest_date_diff > expected_date_diff:
                logger.info(f"Missing dates from: {to_datetime(min(gs_metadata.max_time)).date()}")

            num_export_days = expected_date_diff

    return num_export_days


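# Sketch of the two branches above (values are illustrative): if any expected
# signal is missing from the covidcast metadata, export the full history since
# FULL_BKFILL_START_DATE; otherwise export span_length + max expected lag days,
# e.g. 14 + 4 = 18 with the defaults used in this module.
#
#     num_export_days = generate_num_export_days(params, logger)
#     # missing signal  -> (export_end_date - FULL_BKFILL_START_DATE).days + 1
#     # all present     -> 18

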
def generate_query_dates(
    export_start_date: date, export_end_date: date, num_export_days: int, custom_run_flag: bool
) -> List[date]:
    """Produce date range to retrieve data for.

    Calculate the start of the date range as a static offset from the end date.
    Pad the date range by an additional `PAD_DAYS` days before the earliest date to
    produce enough data for calculating smoothed estimates.

    Parameters
    ----------
    export_start_date: date
        first date to retrieve data for
    export_end_date: date
        last date to retrieve data for
    num_export_days: int
        number of days before the end date to export
    custom_run_flag: bool
        if True, use the given `export_start_date` as-is; otherwise compute the
        start date from `export_end_date` and `num_export_days`

    Returns
    -------
    List[date]
        two-element list of [start_date, end_date]
    """
    start_date = export_start_date
    if not custom_run_flag:
        start_date = export_end_date - timedelta(days=num_export_days)
    # Pad the start date back by PAD_DAYS - 1 extra days for smoothing.
    retrieve_dates = [start_date - timedelta(days=PAD_DAYS - 1), export_end_date]

    return retrieve_dates
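

# A minimal sketch of generate_query_dates (dates are illustrative; PAD_DAYS
# comes from .constants):
#
#     generate_query_dates(date(2024, 1, 1), date(2024, 1, 31), 14, False)
#     # start_date = 2024-01-31 - 14 days = 2024-01-17, then padded back by
#     # PAD_DAYS - 1 more days
#     # -> [date(2024, 1, 17) - timedelta(days=PAD_DAYS - 1), date(2024, 1, 31)]
#
# With custom_run_flag=True, the given export_start_date is padded instead.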