Merge pull request #14 from Big-Life-Lab/code-fixes

zargot · web-flow · commit 9c426b7c975c · 2024-06-28T18:01:56.000-04:00
misc CLI fixes
diff --git a/docs/tech-spec.md b/docs/tech-spec.md
@@ -59,6 +59,10 @@ Options:
     output files. This shows which tables and columns are selected, and how
     many rows each filter returns.
 
+- `-q`, `--quiet`:
+
+    Don't log to STDOUT.
+
 One or multiple sharable output files will be created in the chosen output
 directory according to the chosen output format and organization(s). Each
 output file will have the input filename followed by a postfix with the org
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,9 @@ dynamic = ["dependencies"]
 "Homepage" = "https://github.com/Big-Life-Lab/PHES-ODM-sharing"
 "Bug Tracker" = "https://github.com/Big-Life-Lab/PHES-ODM-sharing/issues"
 
+[project.scripts]
+odm-share = "odm_sharing.tools.share:main"
+
 [build-system]
 requires = ["hatchling", "hatch-requirements-txt"]
 build-backend = "hatchling.build"
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 SQLAlchemy==2.0.29
+numpy==1.24.4
 openpyxl==3.1.2
 pandas==2.0.3
 pyfunctional==1.5.0
diff --git a/src/odm_sharing/private/cons.py b/src/odm_sharing/private/cons.py
@@ -1,37 +1,60 @@
-from typing import List, cast
+import logging
+from pathlib import Path
+from typing import List, Set
 
 import pandas as pd
 import sqlalchemy as sa
 
 
-Connection = object  # opaque data-source connection handle
+Connection = sa.engine.Engine
 
 
 class DataSourceError(Exception):
     pass
 
 
-def _connect_excel(path: str, tables: List[str]) -> Connection:
-    ''':raises OSError:'''
-    # copies excel data to in-memory db, to abstract everything as a db
-    print('importing excel workbook')
-    table_whitelist = set(tables)
-    db = sa.create_engine('sqlite://', echo=False)
+def _create_memory_db() -> sa.engine.Engine:
+    return sa.create_engine('sqlite://', echo=False)
+
+
+def _write_table_to_db(db: sa.engine.Engine, table: str, df: pd.DataFrame
+                       ) -> None:
+    logging.info(f'- table {table}')
+    df.to_sql(table, db, index=False, if_exists='replace')
+
+
+def _connect_csv(path: str) -> Connection:
+    '''copies file data to in-memory db
+
+    :raises OSError:'''
+    logging.info('importing csv file')
+    table = Path(path).stem
+    db = _create_memory_db()
+    df = pd.read_csv(path)
+    _write_table_to_db(db, table, df)
+    return db
+
+
+def _connect_excel(path: str, table_whitelist: Set[str]) -> Connection:
+    '''copies file data to in-memory db
+
+    :raises OSError:'''
+    logging.info('importing excel workbook')
+    db = _create_memory_db()
     xl = pd.ExcelFile(path)
     included_tables = set(map(str, xl.sheet_names)) & table_whitelist
     for table in included_tables:
-        print(f'- table {table}')
         df = xl.parse(sheet_name=table)
-        df.to_sql(table, db, index=False, if_exists='replace')
-    return cast(Connection, db)
+        _write_table_to_db(db, table, df)
+    return db
 
 
 def _connect_db(url: str) -> Connection:
     ''':raises sa.exc.OperationalError:'''
     return sa.create_engine(url)
 
 
-def connect(data_source: str, tables: List[str] = []) -> Connection:
+def connect(data_source: str, tables: Set[str] = set()) -> Connection:
     '''
     connects to a data source and returns the connection
 
@@ -41,7 +64,9 @@ def connect(data_source: str, tables: List[str] = []) -> Connection:
     :raises DataSourceError:
     '''
     try:
-        if data_source.endswith('.xlsx'):
+        if data_source.endswith('.csv'):
+            return _connect_csv(data_source)
+        elif data_source.endswith('.xlsx'):
             return _connect_excel(data_source, tables)
         else:
             return _connect_db(data_source)
@@ -51,16 +76,15 @@ def connect(data_source: str, tables: List[str] = []) -> Connection:
 
 def get_dialect_name(c: Connection) -> str:
     '''returns the name of the dialect used for the connection'''
-    return cast(sa.engine.Engine, c).dialect.name
+    return c.dialect.name
 
 
 def exec(c: Connection, sql: str, sql_args: List[str] = []) -> pd.DataFrame:
     '''executes sql with args on connection
 
     :raises DataSourceError:
     '''
-    db = cast(sa.engine.Engine, c)
     try:
-        return pd.read_sql_query(sql, db, params=tuple(sql_args))
+        return pd.read_sql_query(sql, c, params=tuple(sql_args))
     except sa.exc.OperationalError as e:
         raise DataSourceError(str(e))
diff --git a/src/odm_sharing/private/stdext.py b/src/odm_sharing/private/stdext.py
@@ -6,12 +6,13 @@
 class StrEnum(str, Enum):
     '''shim for python < 3.11
 
-    Provides a ``__str__()`` function that returns the enum string-value, which
-    is useful for printing the value or comparing it with another string.
+    Gives the enum's assigned string value when converted to string, which is
+    useful for printing the value or comparing it with another string.
 
     See https://docs.python.org/3.11/library/enum.html#enum.StrEnum
     '''
-    pass
+    def __str__(self) -> str:
+        return str(self.value)
 
 
 class StrValueEnum(StrEnum):
diff --git a/src/odm_sharing/sharing.py b/src/odm_sharing/sharing.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
 
 import pandas as pd
 from functional import seq
@@ -29,7 +29,7 @@ def parse(schema_path: str, orgs: List[str] = []) -> OrgTableQueries:
     return queries.generate(tree)
 
 
-def connect(data_source: str, tables: List[str] = []) -> Connection:
+def connect(data_source: str, tables: Set[str] = set()) -> Connection:
     '''returns a connection object that can be used together with a query
     object to retrieve data from `data_source`
 
@@ -67,7 +67,7 @@ def get_columns(c: Connection, tq: TableQuery
     if tq.columns:
         return (tq.select_rule_id, tq.columns)
     else:
-        dialect = queries.SqlDialect(cons.get_dialect_name(c))
+        dialect = queries.parse_sql_dialect(cons.get_dialect_name(c))
         sql = queries.get_column_sql(tq, dialect)
         columns = cons.exec(c, sql).columns.array.tolist()
         return (tq.select_rule_id, columns)
diff --git a/src/odm_sharing/tools/share.py b/src/odm_sharing/tools/share.py
@@ -1,14 +1,15 @@
 import contextlib
+import logging
 import os
+import sys
 from enum import Enum
 from os import linesep
 from pathlib import Path
-from typing import Dict, List, Optional, Set, TextIO
+from typing import Dict, List, Optional, Set, TextIO, Union
 from typing_extensions import Annotated
 
 import pandas as pd
 import typer
-import sqlalchemy as sa
 from tabulate import tabulate
 from functional import seq
 
@@ -24,6 +25,7 @@
 
 class OutFmt(str, Enum):
     '''output format'''
+    AUTO = 'auto'
     CSV = 'csv'
     EXCEL = 'excel'
 
@@ -41,10 +43,23 @@ class OutFmt(str, Enum):
 creating sharable output files. This shows which tables and columns are
 selected, and how many rows each filter returns.'''
 
+QUIET_DESC = 'Don\'t log to STDOUT.'
+
+# default cli args
+DEBUG_DEFAULT = False
+ORGS_DEFAULT = []
+OUTDIR_DEFAULT = './'
+OUTFMT_DEFAULT = OutFmt.AUTO
+QUIET_DEFAULT = False
 
 app = typer.Typer(pretty_exceptions_show_locals=False)
 
 
+def error(msg: str) -> None:
+    print(msg, file=sys.stderr)
+    logging.error(msg)
+
+
 def write_line(file: TextIO, text: str = '') -> None:
     '''writes a line to STDOUT and file'''
     print(text)
@@ -109,57 +124,76 @@ def get_tables(org_queries: sh.queries.OrgTableQueries) -> Set[str]:
     return result
 
 
-def gen_filename(org: str, table: str, ext: str) -> str:
-    # <org>[-<table>].<ext>
-    return org + (f'-{table}' if table else '') + f'.{ext}'
+def gen_filename(in_name: str, org: str, table: str, ext: str) -> str:
+    if in_name == table or not table:
+        # this avoids duplicating the table name when both input and output is
+        # CSV
+        return f'{in_name}-{org}.{ext}'
+    else:
+        return f'{in_name}-{org}-{table}.{ext}'
 
 
-def get_debug_writer(debug: bool) -> TextIO:
+def get_debug_writer(debug: bool) -> Union[TextIO, contextlib.nullcontext]:
     # XXX: this function is only used for brewity with the below `with` clause
     if debug:
         return open('debug.txt', 'w')
     else:
         return contextlib.nullcontext()
 
 
-def get_excel_writer(debug: bool, org: str, outdir: str, outfmt: OutFmt
-                     ) -> Optional[pd.ExcelWriter]:
+def get_excel_writer(in_name, debug: bool, org: str, outdir: str,
+                     outfmt: OutFmt) -> Optional[pd.ExcelWriter]:
     if not debug and outfmt == OutFmt.EXCEL:
-        filename = gen_filename(org, '', 'xlsx')
-        print('writing ' + filename)
+        filename = gen_filename(in_name, org, '', 'xlsx')
+        logging.info('writing ' + filename)
         excel_path = os.path.join(outdir, filename)
         return pd.ExcelWriter(excel_path)
+    else:
+        return None
 
 
-@app.command()
-def main(
-    schema: str = typer.Argument(default=..., help=SCHEMA_DESC),
-    input: str = typer.Argument(default='', help=INPUT_DESC),
-    orgs: List[str] = typer.Option(default=[], help=ORGS_DESC),
-    outfmt: OutFmt = typer.Option(default=OutFmt.EXCEL, help=OUTFMT_DESC),
-    outdir: str = typer.Option(default='./', help=OUTDIR_DESC),
-    debug: Annotated[bool, typer.Option("-d", "--debug",
-                                        help=DEBUG_DESC)] = False,
+def infer_outfmt(path: str) -> Optional[OutFmt]:
+    '''returns None when not recognized'''
+    (_, ext) = os.path.splitext(path)
+    if ext == '.csv':
+        return OutFmt.CSV
+    elif ext == '.xlsx':
+        return OutFmt.EXCEL
+
+
+def share(
+    schema: str,
+    input: str,
+    orgs: List[str] = ORGS_DEFAULT,
+    outfmt: OutFmt = OUTFMT_DEFAULT,
+    outdir: str = OUTDIR_DEFAULT,
+    debug: bool = DEBUG_DEFAULT,
 ) -> None:
     schema_path = schema
-    filename = Path(schema_path).name
+    schema_filename = Path(schema_path).name
+    in_name = Path(input).stem
+
+    if outfmt == OutFmt.AUTO:
+        fmt = infer_outfmt(input)
+        if not fmt:
+            error('unable to infer output format from input file')
+            return
+        outfmt = fmt
 
-    print(f'loading schema {qt(filename)}')
+    logging.info(f'loading schema {qt(schema_filename)}')
     try:
         ruleset = rules.load(schema_path)
-        ruletree = trees.parse(ruleset, orgs, filename)
+        ruletree = trees.parse(ruleset, orgs, schema_filename)
         org_queries = queries.generate(ruletree)
         table_filter = get_tables(org_queries)
     except rules.ParseError:
         # XXX: error messages are already printed at this point
-        exit(1)
+        return
 
     # XXX: only tables found in the schema are considered in the data source
-    print(f'connecting to {qt(input)}')
+    logging.info(f'connecting to {qt(input)}')
     con = sh.connect(input, table_filter)
 
-    if debug:
-        print()
     # one debug file per run
     with get_debug_writer(debug) as debug_file:
         for org, table_queries in org_queries.items():
@@ -172,26 +206,50 @@ def main(
                     org_data[table] = sh.get_data(con, tq)
 
             # one excel file per org
-            excel_file = get_excel_writer(debug, org, outdir, outfmt)
+            excel_file = get_excel_writer(in_name, debug, org, outdir, outfmt)
             try:
                 for table, data in org_data.items():
                     if outfmt == OutFmt.CSV:
-                        filename = gen_filename(org, table, 'csv')
-                        print('writing ' + filename)
-                        data.to_csv(os.path.join(outdir, filename))
+                        filename = gen_filename(in_name, org, table, 'csv')
+                        logging.info('writing ' + filename)
+                        path = os.path.join(outdir, filename)
+                        data.to_csv(path, index=False)
                     elif outfmt == OutFmt.EXCEL:
-                        print(f'- {qt(table)}')
+                        logging.info(f'- {qt(table)}')
                         data.to_excel(excel_file, sheet_name=table)
                     else:
                         assert False, f'format {outfmt} not impl'
             except IndexError:
                 # XXX: this is thrown from excel writer when nothing is written
-                exit('failed to write output, most likely due to empty input')
+                error('failed to write output, most likely due to empty input')
+                return
             finally:
                 if excel_file:
                     excel_file.close()
-    print('done')
+    logging.info('done')
 
 
-if __name__ == '__main__':
+@app.command()
+def main_cli(
+    schema: str = typer.Argument(default=..., help=SCHEMA_DESC),
+    input: str = typer.Argument(default='', help=INPUT_DESC),
+    orgs: List[str] = typer.Option(default=ORGS_DEFAULT, help=ORGS_DESC),
+    outfmt: OutFmt = typer.Option(default=OUTFMT_DEFAULT, help=OUTFMT_DESC),
+    outdir: str = typer.Option(default=OUTDIR_DEFAULT, help=OUTDIR_DESC),
+    debug: Annotated[bool, typer.Option("-d", "--debug",
+                                        help=DEBUG_DESC)] = DEBUG_DEFAULT,
+    quiet: Annotated[bool, typer.Option("-q", "--quiet",
+                                        help=QUIET_DESC)] = QUIET_DEFAULT,
+) -> None:
+    if not quiet:
+        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+    share(schema, input, orgs, outfmt, outdir, debug)
+
+
+def main():
+    # runs main_cli
     app()
+
+
+if __name__ == '__main__':
+    main()

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`SQLAlchemy==2.0.29`
	`2`	`+numpy==1.24.4`
`2`	`3`	`openpyxl==3.1.2`
`3`	`4`	`pandas==2.0.3`
`4`	`5`	`pyfunctional==1.5.0`