
Commit c16ed98

melange396 and rzats
authored and committed
skeleton for yaml accessor class, untested and very incomplete
1 parent f611423 commit c16ed98

File tree

1 file changed, +143 −18 lines changed


src/ddl/covid_hosp_schema_io.py

+143-18
@@ -1,3 +1,10 @@
+# TODO: this file and the associated yaml file probably belong in src/common/covid_hosp/ or at least src/common/acquisition/covid_hosp/common/
+
+from datetime import datetime
+import re
+import sys
+
+# UNCOMMENT: from delphi.epidata.acquisition.covid_hosp.common.utils import Utils

 # ruamel preserves key ordering, comments, and some formatting for a "round trip" of a yaml file import-->export
 from ruamel.yaml.main import (
@@ -12,27 +19,145 @@
 # print(yaml_dump(yaml_load('NULL: ~'))) # ==> "~: ~\n"


+class CovidHospSomething:
+
+  YAML_FILENAME = 'covid_hosp_schemadefs.yaml'
+
+  TYPE_MAPPING = {
+    'int': int,
+    'float': float,
+    'str': str,
+    'fixedstr': str,
+    'intdate': int, # UNCOMMENT: Utils.int_from_date,
+    'point': str, # UNCOMMENT: Utils.limited_geocode,
+  }
+
+  MYSQL_COL_NAME_PATTERN = re.compile(r'^[a-zA-Z0-9_]{3,64}$')
+
+
+  def __init__(self):
+    self.read_schemadefs()
+
+
+  def read_schemadefs(self):
+    # TODO: put the yaml file inside the package structure and access it there, with something like:
+    # from importlib import resources
+    # import delphi.epidata.common.covid_hosp
+    # self.yaml_content = resources.read_text(delphi.epidata.common.covid_hosp, YAML_FILENAME)
+    with open(CovidHospSomething.YAML_FILENAME, 'r') as yaml_file:
+      self.yaml_content = yaml_load(yaml_file, preserve_quotes=True)
+    return self.yaml_content
+
+
+  def write_schemadefs(self, filename=YAML_FILENAME):
+    with open(filename, 'w') as yaml_file:
+      # NOTE: `width` specification is to prevent dump from splitting long lines
+      # TODO: consider `block_seq_indent=2` to make list under ORDERED_CSV_COLUMNS look a little better
+      yaml_dump(self.yaml_content, yaml_file, width=200)
+
+
+  def dataset_names(self):
+    return self.yaml_content.keys()
+
+
+  def dataset(self, ds_name):
+    return self.yaml_content[ds_name]
+
+
+  def columns(self, ds_name):
+    for dtype_cplx, name, sql_name in self.dataset(ds_name)['ORDERED_CSV_COLUMNS']:
+      if sql_name is None:
+        sql_name = name
+      if ':' in dtype_cplx:
+        dtype, col_width = dtype_cplx.split(':')
+        col_width = int(col_width)
+      else:
+        dtype = dtype_cplx
+        col_width = None
+      yield {'name': name, 'sql_name': sql_name, 'dtype': dtype, 'col_width': col_width, 'marshaller': CovidHospSomething.TYPE_MAPPING[dtype]}
+
+
+  def add_column(self, ds_name, col_name, dtype, sql_name=None, col_width=None):
+    # if provided, append a column width to the type
+    if col_width:
+      dtype_cplx = f"{dtype}:{col_width}"
+    else:
+      dtype_cplx = dtype
+
+    # verify name to be used for MySQL column is acceptable
+    if sql_name is None:
+      sql_name = col_name
+    if not CovidHospSomething.MYSQL_COL_NAME_PATTERN.match(sql_name):
+      raise Exception(f"Invalid name for MySQL column: {sql_name}")
+
+    # TODO: check for name collisions here with self.columns(ds_name)['name', 'sql_name']
+
+    # for readability of resulting YAML output, don't repeat SQL column name when it matches original
+    if col_name == sql_name:
+      sql_name = None
+
+    # add new column to end of current column list
+    self.dataset(ds_name)['ORDERED_CSV_COLUMNS'].append([dtype_cplx, col_name, sql_name])
+
+
+  def detect_changes(self, ds_name):
+    # TODO
+    ds = self.dataset(ds_name)
+    print("TODO download: " + ds['METADATA_ID'])
+    # TODO: compute set difference: metadata.columns - ds.columns
+    # TODO: foreach column in that diff:
+    #   guess at column type
+    #   if not MYSQL_COL_NAME_PATTERN.match(), suggest sql column name
+    return []
+
+
+  def write_new_definitions(self):
+    today_str = datetime.now().strftime("%Y_%m_%d")
+
+    yaml_file = CovidHospSomething.YAML_FILENAME
+    ddl_file = 'covid_hosp.sql'
+    migration_file = f"covid_hosp_v{today_str}.sql"
+
+    # TODO: do these with proper python `pathlib.Path` objects
+    repo_root = './'
+    yaml_file_directory = repo_root + 'src/ddl/'
+    ddl_file_directory = repo_root + 'src/ddl/'
+    migration_file_directory = repo_root + 'src/ddl/migrations/'
+
+    # write updated yaml file
+    self.write_schemadefs()
+
+    # write newly generated sql definition file
+    with open(ddl_file, 'w') as f:
+      # TODO
+      f.write("\n")
+
+    # write migration file for new columns
+    with open(migration_file, 'w') as f:
+      # TODO
+      f.write("\n")
+
+    # move files into proper locations
+    # UNCOMMENT: os.system(f"mv -f {yaml_file} {yaml_file_directory}")
+    # UNCOMMENT: os.system(f"mv -f {ddl_file} {ddl_file_directory}")
+    # UNCOMMENT: os.system(f"mv -f {migration_file} {migration_file_directory}")

-def read_schemadefs():
-  with open("covid_hosp_schemadefs.yaml", 'r') as yaml_file:
-    yaml_content = yaml_load(yaml_file, preserve_quotes=True)
-  return yaml_content


-def write_schemadefs(yaml_content):
-  # this block replaces duplicated column names with null/None/~
-  # (when same name is shared between SQL representation and python/source representation)
-  for dataset in yaml_content:
-    for col in yaml_content[dataset]['ORDERED_CSV_COLUMNS']:
-      if col[0]==col[1]:
-        col[1] = None
-  with open("covid_hosp_schemadefs__out.yaml", 'w') as yaml_file:
-    yaml_dump(yaml_content, yaml_file, width=200)
-    # NOTE: `width` specification is to prevent dump from splitting long lines
-    # TODO: consider `block_seq_indent=2` to make list under ORDERED_CSV_COLUMNS look a little better
+if __name__ == "__main__":
+  chs = CovidHospSomething()
+  changed = False

+  for ds_name in chs.dataset_names():
+    ds = chs.dataset(ds_name)
+    new_cols = chs.detect_changes(ds_name)
+    if new_cols:
+      changed = True
+      for col in new_cols:
+        chs.add_column(ds_name, col.name, col.dtype)

-yaml_content = read_schemadefs()
-write_schemadefs(yaml_content)
+  if changed:
+    chs.write_new_definitions()
+    sys.exit(1)

-# TODO: figure out what to do wrt splitting lines (maybe split line inside ORDERED_CSV_COLUMNS list between column name and column rename?!)
+sys.exit(0)
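
A rough usage sketch, not part of the commit: the dataset name 'example_dataset', its METADATA_ID value, and the column entries below are made up, and __init__ is bypassed so no real yaml file is read. It only illustrates the [dtype, name, sql_name] triples that columns() expects under ORDERED_CSV_COLUMNS and how a "dtype:width" spec is split into dtype and col_width.

# Illustrative only: feed the accessor a hand-built structure shaped like what yaml_load would return.
chs = CovidHospSomething.__new__(CovidHospSomething)  # skip __init__ to avoid opening the schema yaml file
chs.yaml_content = {
  'example_dataset': {                         # hypothetical dataset name
    'METADATA_ID': 'placeholder-metadata-id',  # hypothetical value
    'ORDERED_CSV_COLUMNS': [
      ['int', 'total_beds', None],             # sql name of None falls back to the csv column name
      ['fixedstr:32', 'state', 'state_code'],  # "fixedstr:32" splits into dtype 'fixedstr' and col_width 32
    ],
  },
}

for col in chs.columns('example_dataset'):
  print(col['sql_name'], col['dtype'], col['col_width'], col['marshaller'])
# -> total_beds int None <class 'int'>
# -> state_code fixedstr 32 <class 'str'>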
