|
| 1 | +# TODO: this file and the associated yaml file probably belong in src/common/covid_hosp/ or at least src/common/acquisition/covid_hosp/common/ |
| 2 | + |
| 3 | +from datetime import datetime |
| 4 | +import re |
| 5 | +import sys |
| 6 | + |
| 7 | +# UNCOMMENT: from delphi.epidata.acquisition.covid_hosp.common.utils import Utils |
1 | 8 |
|
2 | 9 | # ruamel preserves key ordering, comments, and some formatting for a "round trip" of a yaml file import-->export
|
3 | 10 | from ruamel.yaml.main import (
|
|
12 | 19 | # print(yaml_dump(yaml_load('NULL: ~'))) # ==> "~: ~\n"
|
13 | 20 |
|
14 | 21 |
|
class CovidHospSomething:
    """Accessor and editor for the covid_hosp schema definitions YAML file.

    The parsed YAML is kept in `self.yaml_content`, a mapping keyed by dataset
    name.  Each dataset entry is expected to provide:
      - 'ORDERED_CSV_COLUMNS': a list of `[dtype_cplx, name, sql_name]` triples,
        where `dtype_cplx` is either a bare type name (a key of `TYPE_MAPPING`)
        or `"<type>:<width>"`, and `sql_name` is None when the SQL column name
        is the same as `name`.
      - 'METADATA_ID': a string identifier (currently only printed by
        `detect_changes`).
    """

    YAML_FILENAME = 'covid_hosp_schemadefs.yaml'

    # maps the type names used in the YAML file to the callable that converts a raw value
    TYPE_MAPPING = {
        'int': int,
        'float': float,
        'str': str,
        'fixedstr': str,
        'intdate': int, # UNCOMMENT: Utils.int_from_date,
        'point': str, # UNCOMMENT: Utils.limited_geocode,
    }

    # names usable as MySQL columns here: 3 to 64 characters of letters, digits, or underscore
    MYSQL_COL_NAME_PATTERN = re.compile(r'^[a-zA-Z0-9_]{3,64}$')


    def __init__(self):
        """Load the schema definitions from `YAML_FILENAME` in the current directory."""
        self.read_schemadefs()


    def read_schemadefs(self):
        """Parse the YAML file into `self.yaml_content` and return it."""
        # TODO: put the yaml file inside the package structure and access it there, with something like:
        #   from importlib import resources
        #   import delphi.epidata.common.covid_hosp
        #   self.yaml_content = resources.read_text(delphi.epidata.common.covid_hosp, YAML_FILENAME)
        with open(CovidHospSomething.YAML_FILENAME, 'r') as yaml_file:
            self.yaml_content = yaml_load(yaml_file, preserve_quotes=True)
        return self.yaml_content


    # NOTE: the default must be the bare class-scope name `YAML_FILENAME`; the name
    #   `CovidHospSomething` is not bound yet while the class body is being executed.
    def write_schemadefs(self, filename=YAML_FILENAME):
        """Dump `self.yaml_content` to `filename` (defaults to `YAML_FILENAME`)."""
        with open(filename, 'w') as yaml_file:
            # NOTE: `width` specification is to prevent dump from splitting long lines
            # TODO: consider `block_seq_indent=2` to make list under ORDERED_CSV_COLUMNS look a little better
            yaml_dump(self.yaml_content, yaml_file, width=200)


    def dataset_names(self):
        """Return the top-level keys of the YAML content (one per dataset)."""
        return self.yaml_content.keys()


    def dataset(self, ds_name):
        """Return the definition stored under `ds_name`; raises KeyError if absent."""
        return self.yaml_content[ds_name]


    def columns(self, ds_name):
        """Yield one dict per entry of the dataset's 'ORDERED_CSV_COLUMNS' list.

        Each yielded dict has the keys:
          'name'       -- column name as listed in the YAML file
          'sql_name'   -- SQL column name; falls back to 'name' when the YAML value is None
          'dtype'      -- type name with any ":<width>" suffix removed
          'col_width'  -- the width suffix as an int, or None when there is no suffix
          'marshaller' -- `TYPE_MAPPING[dtype]`

        Raises KeyError for a dtype that is not in `TYPE_MAPPING`, and ValueError
        for a width suffix that is not an integer.
        """
        for dtype_cplx, name, sql_name in self.dataset(ds_name)['ORDERED_CSV_COLUMNS']:
            if sql_name is None:
                sql_name = name
            if ':' in dtype_cplx:
                # split only once so the unpacking cannot fail on extra colons
                dtype, col_width = dtype_cplx.split(':', 1)
                col_width = int(col_width)
            else:
                dtype = dtype_cplx
                col_width = None
            yield {
                'name': name,
                'sql_name': sql_name,
                'dtype': dtype,
                'col_width': col_width,
                'marshaller': CovidHospSomething.TYPE_MAPPING[dtype],
            }


    def add_column(self, ds_name, col_name, dtype, sql_name=None, col_width=None):
        """Append a column definition to the end of the dataset's column list.

        `sql_name` defaults to `col_name`.  Whichever name ends up being used for
        the SQL column must match `MYSQL_COL_NAME_PATTERN`, otherwise ValueError
        is raised and nothing is appended.
        """
        # if provided, append a column width to the type
        if col_width:
            dtype_cplx = f"{dtype}:{col_width}"
        else:
            dtype_cplx = dtype

        # verify name to be used for MySQL column is acceptable
        if sql_name is None:
            sql_name = col_name
        # `fullmatch` so that a name with a trailing newline is rejected too
        if not CovidHospSomething.MYSQL_COL_NAME_PATTERN.fullmatch(sql_name):
            raise ValueError(f"Invalid name for MySQL column: {sql_name}")

        # TODO: check for name collisions here with self.columns(ds_name)['name', 'sql_name']

        # for readability of resulting YAML output, dont repeat SQL column name when it matches original
        if col_name == sql_name:
            sql_name = None

        # add new column to end of current column list
        self.dataset(ds_name)['ORDERED_CSV_COLUMNS'].append([dtype_cplx, col_name, sql_name])


    def detect_changes(self, ds_name):
        """Placeholder: print the dataset's 'METADATA_ID' and return an empty list."""
        # TODO
        ds = self.dataset(ds_name)
        print("TODO download: " + ds['METADATA_ID'])
        # TODO: compute set difference: metadata.columns - ds.columns
        # TODO: foreach column in that diff:
        #   guess at column type
        #   if not MYSQL_COL_NAME_PATTERN.match(), suggest sql column name
        return []


    def write_new_definitions(self):
        """Write the YAML file plus placeholder DDL and migration files.

        All three files are written to the current working directory; the
        migration file name carries today's date.  The DDL and migration files
        currently contain only a newline.  Moving the files into the repository
        layout is still commented out below.
        """
        today_str = datetime.now().strftime("%Y_%m_%d")

        yaml_file = CovidHospSomething.YAML_FILENAME
        ddl_file = 'covid_hosp.sql'
        migration_file = f"covid_hosp_v{today_str}.sql"

        # TODO: do these with proper python `pathlib.Path` objects
        repo_root = './'
        yaml_file_directory = repo_root + 'src/ddl/'
        ddl_file_directory = repo_root + 'src/ddl/'
        migration_file_directory = repo_root + 'src/ddl/migrations/'

        # write updated yaml file
        self.write_schemadefs()

        # write newly generated sql definition file
        with open(ddl_file, 'w') as f:
            # TODO
            f.write("\n")

        # write migration file for new columns
        with open(migration_file, 'w') as f:
            # TODO
            f.write("\n")

        # move files into proper locations
        # NOTE: the lines below need `import os` at the top of the file when uncommented
        # UNCOMMENT: os.system(f"mv -f {yaml_file} {yaml_file_directory}")
        # UNCOMMENT: os.system(f"mv -f {ddl_file} {ddl_file_directory}")
        # UNCOMMENT: os.system(f"mv -f {migration_file} {migration_file_directory}")
15 | 144 |
|
16 |
| -def read_schemadefs(): |
17 |
| - with open("covid_hosp_schemadefs.yaml", 'r') as yaml_file: |
18 |
| - yaml_content = yaml_load(yaml_file, preserve_quotes=True) |
19 |
| - return yaml_content |
20 | 145 |
|
21 | 146 |
|
22 |
| -def write_schemadefs(yaml_content): |
23 |
| - # this block replaces duplicated column names with null/None/~ |
24 |
| - # (when same name is shared between SQL representation and python/source representation) |
25 |
| - for dataset in yaml_content: |
26 |
| - for col in yaml_content[dataset]['ORDERED_CSV_COLUMNS']: |
27 |
| - if col[0]==col[1]: |
28 |
| - col[1] = None |
29 |
| - with open("covid_hosp_schemadefs__out.yaml", 'w') as yaml_file: |
30 |
| - yaml_dump(yaml_content, yaml_file, width=200) |
31 |
| - # NOTE: `width` specification is to prevent dump from splitting long lines |
32 |
| - # TODO: consider `block_seq_indent=2` to make list under ORDERED_CSV_COLUMNS look a little better |
if __name__ == "__main__":
    # Script entry point: look for new columns in every dataset and, if any are
    # found, record them and regenerate the definition files.
    chs = CovidHospSomething()
    changed = False

    for ds_name in chs.dataset_names():
        new_cols = chs.detect_changes(ds_name)
        if new_cols:
            changed = True
            for col in new_cols:
                # NOTE(review): assumes each detected column exposes `.name` and `.dtype`;
                #   `detect_changes` is still a stub that returns [] -- confirm once implemented
                chs.add_column(ds_name, col.name, col.dtype)

    if changed:
        chs.write_new_definitions()
        # non-zero exit status tells the caller that the definitions were changed
        sys.exit(1)

    sys.exit(0)
0 commit comments