
Commit 102c9a0

Merge pull request #96 from casework/release-0.9.0
Release 0.9.0
2 parents (4123bff + be829b1); commit 102c9a0

File tree: 61 files changed (+787, -73 lines changed)

Some content is hidden: large commits have some content hidden by default.


.github/CODEOWNERS

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# This file lists the contributors responsible for the
+# repository content. They will also be automatically
+# asked to review any pull request made in this repository.
+
+# Each line is a file pattern followed by one or more owners.
+# The sequence matters: later patterns take precedence.
+
+# FILES OWNERS
+* @casework/maintainers-global
+* @casework/maintainers-case-python-utilities
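Since the file's own comments state that later patterns take precedence, the second "*" line is the one GitHub would apply when both match. A minimal Python sketch of that last-match-wins resolution (the owners_for helper and its fnmatch-based matching are illustrative simplifications, not GitHub's actual matcher):

import fnmatch
import typing


def owners_for(
    path: str, rules: typing.List[typing.Tuple[str, typing.List[str]]]
) -> typing.List[str]:
    # Scan every (pattern, owners) rule in file order; the last match wins.
    owners: typing.List[str] = []
    for (pattern, rule_owners) in rules:
        if fnmatch.fnmatch(path, pattern):
            owners = rule_owners
    return owners


rules = [
    ("*", ["@casework/maintainers-global"]),
    ("*", ["@casework/maintainers-case-python-utilities"]),
]
assert owners_for("case_utils/__init__.py", rules) == [
    "@casework/maintainers-case-python-utilities"
]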

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 22.10.0
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 5.0.4
     hooks:
       - id: flake8
   - repo: https://github.com/pycqa/isort
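These hook pins are plain data, so the version bumps can be asserted mechanically, e.g. in CI. A minimal sketch (assuming PyYAML is available; the expected revisions are the ones introduced above):

import yaml

with open(".pre-commit-config.yaml") as in_fh:
    config = yaml.safe_load(in_fh)

# Map each hook repository's basename to its pinned revision.
revs = {repo["repo"].rsplit("/", 1)[-1]: repo["rev"] for repo in config["repos"]}
assert revs["black"] == "22.10.0"
assert revs["flake8"] == "5.0.4"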

case_utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,6 +11,6 @@
 #
 # We would appreciate acknowledgement if the software is used.

-__version__ = "0.8.0"
+__version__ = "0.9.0"

 from . import local_uuid  # noqa: F401
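The bumped version is importable, so downstream code can confirm it is running the 0.9.0 release; a trivial smoke test:

import case_utils

assert case_utils.__version__ == "0.9.0"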

case_utils/case_sparql_select/__init__.py

Lines changed: 242 additions & 60 deletions
@@ -26,13 +26,14 @@
 Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
 """

-__version__ = "0.4.4"
+__version__ = "0.5.0"

 import argparse
 import binascii
 import logging
 import os
 import sys
+import typing

 import pandas as pd  # type: ignore
 import rdflib.plugins.sparql
@@ -48,69 +49,44 @@
 _logger = logging.getLogger(os.path.basename(__file__))


-def main() -> None:
-    parser = argparse.ArgumentParser()
-
-    # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
-    logging.basicConfig(
-        level=logging.DEBUG
-        if ("--debug" in sys.argv or "-d" in sys.argv)
-        else logging.INFO
-    )
-
-    parser.add_argument("-d", "--debug", action="store_true")
-    parser.add_argument(
-        "--built-version",
-        choices=tuple(built_version_choices_list),
-        default="case-" + CURRENT_CASE_VERSION,
-        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
-    )
-    parser.add_argument(
-        "--disallow-empty-results",
-        action="store_true",
-        help="Raise error if no results are returned for query.",
-    )
-    parser.add_argument(
-        "out_table",
-        help="Expected extensions are .html for HTML tables or .md for Markdown tables.",
-    )
-    parser.add_argument(
-        "in_sparql",
-        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
-    )
-    parser.add_argument("in_graph", nargs="+")
-    args = parser.parse_args()
+def query_text_to_variables(select_query_text: str) -> typing.List[str]:
+    # Build columns list from SELECT line.
+    select_query_text_lines = select_query_text.split("\n")
+    select_line = [
+        line for line in select_query_text_lines if line.startswith("SELECT ")
+    ][0]
+    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
+    return variables

-    graph = rdflib.Graph()
-    for in_graph_filename in args.in_graph:
-        graph.parse(in_graph_filename)

+def graph_and_query_to_data_frame(
+    graph: rdflib.Graph,
+    select_query_text: str,
+    *args: typing.Any,
+    built_version: str = "case-" + CURRENT_CASE_VERSION,
+    disallow_empty_results: bool = False,
+    use_prefixes: bool = False,
+    **kwargs: typing.Any,
+) -> pd.DataFrame:
     # Inherit prefixes defined in input context dictionary.
     nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

-    select_query_text = None
-    with open(args.in_sparql, "r") as in_fh:
-        select_query_text = in_fh.read().strip()
-    _logger.debug("select_query_text = %r." % select_query_text)
-
+    # Avoid side-effects on input parameter.
     if "subClassOf" in select_query_text:
-        case_utils.ontology.load_subclass_hierarchy(
-            graph, built_version=args.built_version
-        )
+        _graph = rdflib.Graph()
+        _graph += graph
+        case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version)
+    else:
+        _graph = graph

-    # Build columns list from SELECT line.
-    select_query_text_lines = select_query_text.split("\n")
-    select_line = [
-        line for line in select_query_text_lines if line.startswith("SELECT ")
-    ][0]
-    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
+    variables = query_text_to_variables(select_query_text)

     tally = 0
     records = []
     select_query_object = rdflib.plugins.sparql.processor.prepareQuery(
         select_query_text, initNs=nsdict
     )
-    for (row_no, row) in enumerate(graph.query(select_query_object)):
+    for (row_no, row) in enumerate(_graph.query(select_query_object)):
         tally = row_no + 1
         record = []
         for (column_no, column) in enumerate(row):
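This hunk is the core of the release's refactoring: the SELECT-to-DataFrame logic moves out of main() and into importable functions. A minimal usage sketch (the input graph file name and the query are hypothetical; assumes case_utils is installed):

import rdflib

from case_utils.case_sparql_select import (
    graph_and_query_to_data_frame,
    query_text_to_variables,
)

graph = rdflib.Graph()
graph.parse("input.ttl")  # Hypothetical input graph file.

select_query_text = "SELECT ?s ?p ?o\nWHERE { ?s ?p ?o . }"

# Column labels are scraped from the SELECT line: ['?s', '?p', '?o'].
print(query_text_to_variables(select_query_text))

# use_prefixes=True abbreviates IRIs with the graph's bound prefixes.
df = graph_and_query_to_data_frame(graph, select_query_text, use_prefixes=True)
print(df)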
@@ -124,35 +100,241 @@ def main() -> None:
                 # The render to ASCII is in support of this script rendering results for website viewing.
                 # .decode() is because hexlify returns bytes.
                 column_value = binascii.hexlify(column.toPython()).decode()
+            elif isinstance(column, rdflib.URIRef):
+                if use_prefixes:
+                    column_value = graph.namespace_manager.qname(column.toPython())
+                else:
+                    column_value = column.toPython()
             else:
                 column_value = column.toPython()
             if row_no == 0:
                 _logger.debug("row[0]column[%d] = %r." % (column_no, column_value))
             record.append(column_value)
         records.append(record)
+
     if tally == 0:
-        if args.disallow_empty_results:
+        if disallow_empty_results:
             raise ValueError("Failed to return any results.")

     df = pd.DataFrame(records, columns=variables)
+    return df
+
+
+def data_frame_to_table_text(
+    df: pd.DataFrame,
+    *args: typing.Any,
+    json_indent: typing.Optional[int] = None,
+    json_orient: str,
+    output_mode: str,
+    use_header: bool,
+    use_index: bool,
+    **kwargs: typing.Any,
+) -> str:
+    table_text: typing.Optional[str] = None

-    table_text = None
-    if args.out_table.endswith(".html"):
+    # Set up kwargs dicts. One kwarg behaves slightly differently for Markdown vs. other formats.
+    general_kwargs: typing.Dict[str, typing.Any] = dict()
+    md_kwargs: typing.Dict[str, typing.Any] = dict()
+
+    # Note some output modes will drop 'header' from general_kwargs, due to alternate support or lack of support.
+    if use_header:
+        general_kwargs["header"] = True
+    else:
+        general_kwargs["header"] = False
+        md_kwargs["headers"] = tuple()
+
+    general_kwargs["index"] = use_index
+
+    if output_mode in {"csv", "tsv"}:
+        sep: str
+        if output_mode == "csv":
+            sep = ","
+        elif output_mode == "tsv":
+            sep = "\t"
+        else:
+            raise NotImplementedError(
+                "Output extension not implemented in CSV-style output."
+            )
+        table_text = df.to_csv(sep=sep, **general_kwargs)
+    elif output_mode == "html":
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
         # Add CSS classes for CASE website Bootstrap support.
-        table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
-    elif args.out_table.endswith(".md"):
+        table_text = df.to_html(
+            classes=("table", "table-bordered", "table-condensed"), **general_kwargs
+        )
+    elif output_mode == "json":
+        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
+
+        # Drop unsupported kwarg.
+        del general_kwargs["header"]
+
+        table_text = df.to_json(
+            indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
+        )
+    elif output_mode == "md":
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
         # https://pypi.org/project/tabulate/
         # Assume Github-flavored Markdown.
-        table_text = df.to_markdown(tablefmt="github")
-    if table_text is None:
-        raise NotImplementedError(
-            "Unsupported output extension for output filename %r.", args.out_table
+
+        # Drop unsupported kwarg.
+        del general_kwargs["header"]
+
+        table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
+    else:
+        if table_text is None:
+            raise NotImplementedError("Unimplemented output mode: %r." % output_mode)
+    assert table_text is not None
+
+    return table_text
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+
+    # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
+    logging.basicConfig(
+        level=logging.DEBUG
+        if ("--debug" in sys.argv or "-d" in sys.argv)
+        else logging.INFO
+    )
+
+    parser.add_argument("-d", "--debug", action="store_true")
+    parser.add_argument(
+        "--built-version",
+        choices=tuple(built_version_choices_list),
+        default="case-" + CURRENT_CASE_VERSION,
+        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
+    )
+    parser.add_argument(
+        "--disallow-empty-results",
+        action="store_true",
+        help="Raise error if no results are returned for query.",
+    )
+    parser.add_argument(
+        "--json-indent",
+        type=int,
+        help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
+    )
+    parser.add_argument(
+        "--json-orient",
+        default="columns",
+        choices=("columns", "index", "records", "split", "table", "values"),
+        help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
+    )
+    parser.add_argument(
+        "--use-prefixes",
+        action="store_true",
+        help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
+    )
+    parser.add_argument(
+        "out_table",
+        help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
+    )
+    parser.add_argument(
+        "in_sparql",
+        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
+    )
+
+    parser_header_group = parser.add_mutually_exclusive_group(required=False)
+    parser_header_group.add_argument(
+        "--header",
+        action="store_true",
+        help="Print column labels. This is the default behavior.",
+    )
+    parser_header_group.add_argument(
+        "--no-header",
+        action="store_true",
+        help="Do not print column labels.",
+    )
+
+    parser_index_group = parser.add_mutually_exclusive_group(required=False)
+    parser_index_group.add_argument(
+        "--index",
+        action="store_true",
+        help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
+    )
+    parser_index_group.add_argument(
+        "--no-index",
+        action="store_true",
+        help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
+    )
+
+    parser.add_argument("in_graph", nargs="+")
+    args = parser.parse_args()
+
+    output_mode: str
+    if args.out_table.endswith(".csv"):
+        output_mode = "csv"
+    elif args.out_table.endswith(".html"):
+        output_mode = "html"
+    elif args.out_table.endswith(".json"):
+        output_mode = "json"
+    elif args.out_table.endswith(".md"):
+        output_mode = "md"
+    elif args.out_table.endswith(".tsv"):
+        output_mode = "tsv"
+    else:
+        raise NotImplementedError("Output file extension not implemented.")
+
+    graph = rdflib.Graph()
+    for in_graph_filename in args.in_graph:
+        graph.parse(in_graph_filename)
+
+    select_query_text: typing.Optional[str] = None
+    with open(args.in_sparql, "r") as in_fh:
+        select_query_text = in_fh.read().strip()
+    if select_query_text is None:
+        raise ValueError("Failed to load query.")
+    _logger.debug("select_query_text = %r." % select_query_text)
+
+    # Process --header and --no-header.
+    use_header: bool
+    if args.header is True:
+        use_header = True
+    if args.no_header is True:
+        use_header = False
+    else:
+        use_header = True
+
+    # Process --index and --no-index.
+    use_index: bool
+    if args.index is True:
+        use_index = True
+    if args.no_index is True:
+        use_index = False
+    else:
+        use_index = True
+
+    if (
+        output_mode == "json"
+        and use_index is False
+        and args.json_orient not in {"split", "table"}
+    ):
+        raise ValueError(
+            "For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
         )

+    df = graph_and_query_to_data_frame(
+        graph,
+        select_query_text,
+        built_version=args.built_version,
+        disallow_empty_results=args.disallow_empty_results is True,
+        use_prefixes=args.use_prefixes is True,
+    )
+
+    table_text = data_frame_to_table_text(
+        df,
+        json_indent=args.json_indent,
+        json_orient=args.json_orient,
+        output_mode=output_mode,
+        use_header=use_header,
+        use_index=use_index,
+    )
     with open(args.out_table, "w") as out_fh:
         out_fh.write(table_text)
+        if table_text[-1] != "\n":
+            # End file with newline. CSV and TSV modes end with a built-in newline.
+            out_fh.write("\n")


 if __name__ == "__main__":
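The remainder of the new code, data_frame_to_table_text() and the rebuilt main(), separates rendering from querying and adds csv, tsv, and json output modes plus --header/--no-header and --index/--no-index flags. A sketch of calling the renderer directly (the sample DataFrame is invented; note that json_orient is a required keyword even for non-JSON modes, and that suppressing the index in JSON output needs orient 'split' or 'table', mirroring the CLI check above):

import pandas as pd

from case_utils.case_sparql_select import data_frame_to_table_text

df = pd.DataFrame([["a", 1], ["b", 2]], columns=["?letter", "?tally"])

# Github-flavored Markdown, headers on, index column suppressed.
print(
    data_frame_to_table_text(
        df,
        json_indent=None,
        json_orient="columns",
        output_mode="md",
        use_header=True,
        use_index=False,
    )
)

# JSON output; orient='split' is one of the two orients that supports
# suppressing the index.
print(
    data_frame_to_table_text(
        df,
        json_indent=2,
        json_orient="split",
        output_mode="json",
        use_header=True,
        use_index=False,
    )
)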
