| 1 | +""" |
| 2 | +This module is used for patching data in the delphi_nssp package. |
| 3 | +
|
The code assumes the user can use key-based auth to access the prod server
where historical source data is stored.

To use this module, configure params.json like so:

{
    "common": {
        "custom_run": true,
        ...
    },
    "validation": {
        ...
    },
    "patch": {
        "source_host": "prod.server.edu",
        "source_dir": "delphi/covidcast-indicators/nssp/source_data",
        "user": "username",
        "patch_dir": "delphi/covidcast-indicators/nssp/AprilPatch",
        "start_issue": "2024-04-20",
        "end_issue": "2024-04-21"
    }
}

In this params.json, we
- Turn on the "custom_run" flag under "common"
- Add a "patch" section, which contains:
    + "source_host": the prod server where source data is backed up
    + "source_dir": the local directory where source data is downloaded to
    + "user": the username used to log in to the remote server where source data is backed up
    + "patch_dir": the local directory where all patch issue output is written
    + "start_issue": str, YYYY-MM-DD format, first issue date
    + "end_issue": str, YYYY-MM-DD format, last issue date

| 37 | +if "source_dir" doesn't exist locally or has no files in it, we download source data to source_dir |
| 38 | +else, we assume all needed source files are already in source_dir. |
| 39 | +
|
This module will generate data for the given range of issue dates and store it in batch issue format in the patch_dir:
[patch_dir]/issue_[issue-date]/nssp/actual_data_file.csv
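
For example, with the params above, the reporting date 2024-04-20 falls in the epiweek
starting 2024-04-14 (CDC convention, weeks starting on Sunday), so its output would land
under delphi/covidcast-indicators/nssp/AprilPatch/issue_20240414/nssp/.

One way to invoke this module after configuring params.json (assuming the package is
installed in the active environment and params.json is in the working directory):

    python -m delphi_nssp.patch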
| 42 | +""" |
| 43 | + |
import sys
from datetime import datetime
from os import listdir, makedirs, path
from shutil import rmtree

import pandas as pd
from delphi_utils import get_structured_logger, read_params
from epiweeks import Week

from .pull import get_source_data
from .run import run_module


def good_patch_config(params, logger):
    """
    Check if the params.json file is correctly configured for patching.

    params: Dict[str, Any]
        Nested dictionary of parameters, typically loaded from params.json file.
    logger: Logger object
        Logger object to log messages.

    Returns True if the configuration is valid, False otherwise.
    """
    valid_config = True
    custom_run = params["common"].get("custom_run", False)
    if not custom_run:
        logger.error("Calling patch.py without custom_run flag set true.")
        valid_config = False

    patch_config = params.get("patch", {})
    if patch_config == {}:
        logger.error("Custom flag is on, but patch section is missing.")
        valid_config = False
    else:
        required_patch_keys = ["start_issue", "end_issue", "patch_dir", "source_dir"]

        # Use .get() so a missing "source_dir" is reported as a missing key below
        # instead of raising a KeyError here.
        source_dir = patch_config.get("source_dir", "")
        if not path.isdir(source_dir) or not listdir(source_dir):
            required_patch_keys.append("user")
            required_patch_keys.append("source_host")

        missing_keys = [key for key in required_patch_keys if key not in patch_config]
        if missing_keys:
            logger.error("Patch section is missing required key(s)", missing_keys=missing_keys)
            valid_config = False
        else:
            try:  # issue dates validity check
                start_issue = datetime.strptime(patch_config["start_issue"], "%Y-%m-%d")
                end_issue = datetime.strptime(patch_config["end_issue"], "%Y-%m-%d")
                if start_issue > end_issue:
                    logger.error("Start issue date is after end issue date.")
                    valid_config = False
            except ValueError:
                logger.error("Issue dates must be in YYYY-MM-DD format.")
                valid_config = False

    if valid_config:
        logger.info("Good patch configuration.")
        return True
    logger.info("Bad patch configuration.")
    return False


def get_patch_dates(start_issue, end_issue, source_dir):
    """
    Get the dates to run patch on, given a range of issue dates.

    Due to the weekly cadence of nssp data, the dates to run patch on are not necessarily the same as the issue dates.
    We use the latest date with source data in each epiweek as the reporting date for patching that week's data
    (see the example below).

    start_issue: datetime object
    end_issue: datetime object
    source_dir: str, local directory containing source data files named [YYYYMMDD].csv.gz
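
    For example (assuming the CDC epiweek convention used by the epiweeks package, where
    weeks start on Sunday): if source files exist for 2024-04-17 and 2024-04-19, both of
    which fall in the epiweek starting 2024-04-14, only 2024-04-19 is returned as the
    patch date for that week.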
    """
    patch_dates = []
    date_range = pd.date_range(start=start_issue, end=end_issue)
    dates_with_source_data = {
        date for date in date_range if path.isfile(f"""{source_dir}/{date.strftime("%Y%m%d")}.csv.gz""")
    }
    epiweek_start_dates = {Week.fromdate(date).startdate() for date in date_range}
    for epiweek_start_date in epiweek_start_dates:
        epiweek = Week.fromdate(epiweek_start_date)
        dates_with_data_in_epiweek = [date for date in dates_with_source_data if date.date() in epiweek.iterdates()]
        if dates_with_data_in_epiweek == []:
            continue
        latest_date_with_data = max(dates_with_data_in_epiweek)
        patch_dates.append(latest_date_with_data)
    patch_dates.sort()
    return patch_dates


def patch():
    """Run nssp indicator for a range of issue dates."""
    params = read_params()
    logger = get_structured_logger("delphi_nssp.patch", filename=params["common"]["log_filename"])
    if not good_patch_config(params, logger):
        sys.exit(1)

    source_dir = params["patch"]["source_dir"]
    download_source = False
    if not path.isdir(source_dir) or not listdir(source_dir):  # no source dir or empty source dir
        download_source = True
        get_source_data(params, logger)
    else:
        logger.info("Source data already exists locally.")

    start_issue = datetime.strptime(params["patch"]["start_issue"], "%Y-%m-%d")
    end_issue = datetime.strptime(params["patch"]["end_issue"], "%Y-%m-%d")

    logger.info(
        "Preparing to patch",
        start_issue=start_issue.strftime("%Y-%m-%d"),
        end_issue=end_issue.strftime("%Y-%m-%d"),
        source_dir=source_dir,
        patch_dir=params["patch"]["patch_dir"],
    )
    makedirs(params["patch"]["patch_dir"], exist_ok=True)

    patch_dates = get_patch_dates(start_issue, end_issue, source_dir)

    for current_issue in patch_dates:
        logger.info("patching issue", issue_date=current_issue.strftime("%Y%m%d"))

        current_issue_source_csv = f"""{source_dir}/{current_issue.strftime("%Y%m%d")}.csv.gz"""
        if not path.isfile(current_issue_source_csv):
            logger.info("No source data at this path", current_issue_source_csv=current_issue_source_csv)
            continue

        params["patch"]["current_issue"] = current_issue.strftime("%Y%m%d")

        # current_issue_date can be different from params["patch"]["current_issue"]
        # due to the weekly cadence of nssp data. For weekly sources, issue dates in our
        # db match the first date of the epiweek that the reporting date falls in,
        # rather than the reporting date itself.
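        # For example, a reporting date of 2024-04-19 falls in the epiweek starting
        # 2024-04-14 (CDC convention, weeks starting on Sunday), so its output is
        # written under [patch_dir]/issue_20240414/nssp.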
        current_issue_date = Week.fromdate(current_issue).startdate()
        current_issue_dir = f"""{params["patch"]["patch_dir"]}/issue_{current_issue_date.strftime("%Y%m%d")}/nssp"""
        makedirs(current_issue_dir, exist_ok=True)
        params["common"]["export_dir"] = current_issue_dir

        run_module(params, logger)

    if download_source:
        rmtree(source_dir)


if __name__ == "__main__":
    patch()