|
26 | 26 | Created: 2020-08-06
|
27 | 27 | """
|
28 | 28 |
|
29 |
| -from argparse import ArgumentParser |
30 | 29 | from contextlib import contextmanager
|
31 | 30 | import filecmp
|
32 | 31 | from glob import glob
|
33 | 32 | from os import remove, replace
|
34 | 33 | from os.path import join, basename, abspath
|
35 | 34 | import shutil
|
| 35 | +import time |
36 | 36 | from typing import Tuple, List, Dict, Optional
|
37 | 37 |
|
38 | 38 | from boto3 import Session
|
|
42 | 42 | import pandas as pd
|
43 | 43 |
|
44 | 44 | from .utils import read_params
|
| 45 | +from .logger import get_structured_logger |
45 | 46 |
|
46 | 47 | Files = List[str]
|
47 | 48 | FileDiffMap = Dict[str, Optional[str]]
|
@@ -98,43 +99,57 @@ def diff_export_csv(
|
98 | 99 | after_df.loc[added_idx, :])
|
99 | 100 |
|
100 | 101 |
|
101 |
def archiver_from_params(params):
    """Build an ArchiveDiffer from `params`.

    The type of ArchiveDiffer constructed is inferred from the parameters:
    a "branch_name" key selects the git archiver, a "bucket_name" key selects
    the S3 archiver, and a bare {"cache_dir"} selects the filesystem archiver.

    Parameters
    ----------
    params: Dict[str, Dict[str, Any]]
        Dictionary of user-defined parameters with the following structure:
        - "common":
            - "export_dir": str, directory to which indicator output files have been exported
        - "archive":
            - "cache_dir": str, directory containing cached data from previous indicator runs
            - "branch_name" (required for git archiver): str, name of git branch
            - "override_dirty" (optional for git archiver): bool, whether to allow overwriting of
              untracked & uncommitted changes in `cache_dir`
            - "commit_partial_success" (optional for git archiver): bool, whether to still commit
              even if some files were not archived and staged due to `override_dirty=False`
            - "commit_message" (optional for git archiver): str, commit message to use
            - "bucket_name" (required for S3 archiver): str, name of S3 bucket to which to upload
              files
            - "indicator_prefix" (required for S3 archiver): str, S3 prefix for files from this
              indicator
            - "aws_credentials" (required for S3 archiver): Dict[str, str], authentication
              parameters for S3 to create a boto3.Session

    Returns
    -------
    ArchiveDiffer of the inferred type, or None if no "archive" params are present.
    """
    if "archive" not in params:
        return None

    # Shallow-copy so we can pass the options straight through as kwargs
    # (taking advantage of the archivers' default arguments) WITHOUT mutating
    # the caller's `params` dict — the original code injected "export_dir"
    # into params["archive"] as a side effect.
    kwargs = params["archive"].copy()
    kwargs["export_dir"] = params["common"]["export_dir"]

    if "branch_name" in kwargs:
        return GitArchiveDiffer(**kwargs)

    if "bucket_name" in kwargs:
        assert "indicator_prefix" in kwargs, "Missing indicator_prefix in params"
        assert "aws_credentials" in kwargs, "Missing aws_credentials in params"
        return S3ArchiveDiffer(**kwargs)

    # Don't run the filesystem archiver if the user misspecified the archiving params
    assert set(kwargs.keys()) == set(["cache_dir", "export_dir"]),\
        'If you intended to run a filesystem archiver, please remove all options other than '\
        '"cache_dir" from the "archive" params. Otherwise, please include either "branch_name" '\
        'or "bucket_name" to run the git or S3 archivers, respectively.'
    return FilesystemArchiveDiffer(**kwargs)
138 | 153 |
|
139 | 154 |
|
140 | 155 | class ArchiveDiffer:
|
@@ -621,46 +636,26 @@ def update_cache(self):
|
621 | 636 | self._cache_updated = True
|
622 | 637 |
|
if __name__ == "__main__":
    _params = read_params()

    # Autodetect whether parameters have been factored hierarchically or not.
    # See https://github.com/cmu-delphi/covidcast-indicators/issues/847
    # Once all indicators have their parameters factored into "common",
    # "indicator", "validation", and "archive", this shim will be obsolete.
    #
    # Invoking this module from the command line implies the user intends to
    # run archiving, so a missing "archive" sub-object is interpreted to mean
    # the parameters have not yet been hierarchically refactored: treat the
    # flat dict as serving both roles.
    if "archive" not in _params:
        _params = {"archive": _params, "common": _params}

    logger = get_structured_logger(
        __name__, filename=_params["common"].get("log_filename"),
        log_exceptions=_params["common"].get("log_exceptions", True))

    run_start = time.time()
    archiver_from_params(_params).run()
    elapsed = round(time.time() - run_start, 2)

    logger.info("Completed archive run.",
                elapsed_time_in_seconds=elapsed)
0 commit comments