Skip to content

Commit d6761af

Browse files
authored
Merge pull request #93 from casework/case_sparql_select_cross_json_and_anno_flag_testing
case_sparql_select: Cross-test JSON output and --no-(header,index) flag branches
2 parents 83ec9ac + ed44477 commit d6761af

File tree

41 files changed

+529
-70
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+529
-70
lines changed

case_utils/case_sparql_select/__init__.py

Lines changed: 225 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -49,74 +49,44 @@
4949
_logger = logging.getLogger(os.path.basename(__file__))
5050

5151

52-
def main() -> None:
53-
parser = argparse.ArgumentParser()
54-
55-
# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
56-
logging.basicConfig(
57-
level=logging.DEBUG
58-
if ("--debug" in sys.argv or "-d" in sys.argv)
59-
else logging.INFO
60-
)
61-
62-
parser.add_argument("-d", "--debug", action="store_true")
63-
parser.add_argument(
64-
"--built-version",
65-
choices=tuple(built_version_choices_list),
66-
default="case-" + CURRENT_CASE_VERSION,
67-
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
68-
)
69-
parser.add_argument(
70-
"--disallow-empty-results",
71-
action="store_true",
72-
help="Raise error if no results are returned for query.",
73-
)
74-
parser.add_argument(
75-
"--use-prefixes",
76-
action="store_true",
77-
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
78-
)
79-
parser.add_argument(
80-
"out_table",
81-
help="Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values.",
82-
)
83-
parser.add_argument(
84-
"in_sparql",
85-
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
86-
)
87-
parser.add_argument("in_graph", nargs="+")
88-
args = parser.parse_args()
52+
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
    """
    Extract the projected variable names from a SPARQL SELECT query.

    The first line of the query text that begins with ``"SELECT "`` is
    taken as the projection line.  The ``SELECT`` keyword and any
    ``DISTINCT`` modifier are removed, and the remainder is split on
    whitespace.  The result is used by the caller as the column labels of
    the result table.

    Note that this is a line-based heuristic, not a SPARQL parser: the
    whole projection must sit on the same line as ``SELECT``, and
    projection expressions that contain whitespace (e.g.
    ``(COUNT(?x) AS ?n)``) will be split into several tokens.

    :param select_query_text: Full text of the SPARQL SELECT query.
    :returns: The variable tokens in the order they appear, e.g.
        ``["?name", "?mbox"]``.
    :raises IndexError: If no line begins with ``"SELECT "``.  The
        exception type is kept as ``IndexError`` for backward
        compatibility; only a descriptive message was added.
    """
    # splitlines() copes with "\r\n" as well as "\n" line endings.
    select_lines = [
        line
        for line in select_query_text.splitlines()
        if line.startswith("SELECT ")
    ]
    if not select_lines:
        raise IndexError(
            "No line beginning with 'SELECT ' was found in the query text."
        )
    select_line = select_lines[0]

    # Split on arbitrary whitespace rather than a single space, so repeated
    # or trailing spaces (or a stray carriage return) do not produce
    # empty-string "variables" that would mismatch the result column count.
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split()
    return variables
8960

90-
graph = rdflib.Graph()
91-
for in_graph_filename in args.in_graph:
92-
graph.parse(in_graph_filename)
9361

62+
def graph_and_query_to_data_frame(
63+
graph: rdflib.Graph,
64+
select_query_text: str,
65+
*args: typing.Any,
66+
built_version: str = "case-" + CURRENT_CASE_VERSION,
67+
disallow_empty_results: bool = False,
68+
use_prefixes: bool = False,
69+
**kwargs: typing.Any,
70+
) -> pd.DataFrame:
9471
# Inherit prefixes defined in input context dictionary.
9572
nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}
9673

97-
select_query_text = None
98-
with open(args.in_sparql, "r") as in_fh:
99-
select_query_text = in_fh.read().strip()
100-
_logger.debug("select_query_text = %r." % select_query_text)
101-
74+
# Avoid side-effects on input parameter.
10275
if "subClassOf" in select_query_text:
103-
case_utils.ontology.load_subclass_hierarchy(
104-
graph, built_version=args.built_version
105-
)
76+
_graph = rdflib.Graph()
77+
_graph += graph
78+
case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version)
79+
else:
80+
_graph = graph
10681

107-
# Build columns list from SELECT line.
108-
select_query_text_lines = select_query_text.split("\n")
109-
select_line = [
110-
line for line in select_query_text_lines if line.startswith("SELECT ")
111-
][0]
112-
variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
82+
variables = query_text_to_variables(select_query_text)
11383

11484
tally = 0
11585
records = []
11686
select_query_object = rdflib.plugins.sparql.processor.prepareQuery(
11787
select_query_text, initNs=nsdict
11888
)
119-
for (row_no, row) in enumerate(graph.query(select_query_object)):
89+
for (row_no, row) in enumerate(_graph.query(select_query_object)):
12090
tally = row_no + 1
12191
record = []
12292
for (column_no, column) in enumerate(row):
@@ -131,7 +101,7 @@ def main() -> None:
131101
# .decode() is because hexlify returns bytes.
132102
column_value = binascii.hexlify(column.toPython()).decode()
133103
elif isinstance(column, rdflib.URIRef):
134-
if args.use_prefixes:
104+
if use_prefixes:
135105
column_value = graph.namespace_manager.qname(column.toPython())
136106
else:
137107
column_value = column.toPython()
@@ -141,39 +111,225 @@ def main() -> None:
141111
_logger.debug("row[0]column[%d] = %r." % (column_no, column_value))
142112
record.append(column_value)
143113
records.append(record)
114+
144115
if tally == 0:
145-
if args.disallow_empty_results:
116+
if disallow_empty_results:
146117
raise ValueError("Failed to return any results.")
147118

148119
df = pd.DataFrame(records, columns=variables)
120+
return df
149121

122+
123+
def data_frame_to_table_text(
    df: pd.DataFrame,
    *args: typing.Any,
    json_indent: typing.Optional[int] = None,
    json_orient: str,
    output_mode: str,
    use_header: bool,
    use_index: bool,
    **kwargs: typing.Any,
) -> str:
    """
    Render a Pandas DataFrame as table text in the requested output format.

    Extra positional and keyword arguments are accepted and ignored.

    :param df: The DataFrame to render.
    :param json_indent: Indentation width passed to ``DataFrame.to_json``.
        Only used when ``output_mode`` is ``"json"``.
    :param json_orient: Orientation passed to ``DataFrame.to_json``.  Only
        used when ``output_mode`` is ``"json"``.
    :param output_mode: One of ``"csv"``, ``"tsv"``, ``"html"``, ``"json"``,
        or ``"md"``.
    :param use_header: Whether to emit column labels.  Not applicable to
        JSON output.
    :param use_index: Whether to emit the row index.
    :returns: The rendered table.
    :raises NotImplementedError: If ``output_mode`` is not a supported mode.
    """
    table_text: typing.Optional[str] = None

    if output_mode in {"csv", "tsv"}:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
        sep = "," if output_mode == "csv" else "\t"
        table_text = df.to_csv(sep=sep, header=use_header, index=use_index)
    elif output_mode == "html":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(
            classes=("table", "table-bordered", "table-condensed"),
            header=use_header,
            index=use_index,
        )
    elif output_mode == "json":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
        # to_json has no 'header' parameter, so use_header is not forwarded.
        json_kwargs: typing.Dict[str, typing.Any] = dict()
        if not use_index:
            # Only pass 'index' when suppressing it.  Omitting it keeps the
            # library default (index included where the orientation has
            # one).
            # NOTE(review): newer pandas releases appear to reject an
            # explicit index=True for the 'records' and 'values'
            # orientations - confirm against the pinned pandas version.
            json_kwargs["index"] = False
        table_text = df.to_json(
            indent=json_indent, orient=json_orient, date_format="iso", **json_kwargs
        )
    elif output_mode == "md":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.
        # Markdown output takes no 'header' flag; an empty 'headers' tuple
        # is passed through instead to suppress the column labels.
        md_kwargs: typing.Dict[str, typing.Any] = dict()
        if not use_header:
            md_kwargs["headers"] = tuple()
        table_text = df.to_markdown(tablefmt="github", index=use_index, **md_kwargs)
    else:
        raise NotImplementedError("Unimplemented output mode: %r." % output_mode)

    # Narrow Optional[str] to str for type checking; the writers above
    # return text when not given an output buffer.
    assert table_text is not None

    return table_text
189+
190+
191+
def main() -> None:
192+
parser = argparse.ArgumentParser()
193+
194+
# Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
195+
logging.basicConfig(
196+
level=logging.DEBUG
197+
if ("--debug" in sys.argv or "-d" in sys.argv)
198+
else logging.INFO
199+
)
200+
201+
parser.add_argument("-d", "--debug", action="store_true")
202+
parser.add_argument(
203+
"--built-version",
204+
choices=tuple(built_version_choices_list),
205+
default="case-" + CURRENT_CASE_VERSION,
206+
help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
207+
)
208+
parser.add_argument(
209+
"--disallow-empty-results",
210+
action="store_true",
211+
help="Raise error if no results are returned for query.",
212+
)
213+
parser.add_argument(
214+
"--json-indent",
215+
type=int,
216+
help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
217+
)
218+
parser.add_argument(
219+
"--json-orient",
220+
default="columns",
221+
choices=("columns", "index", "records", "split", "table", "values"),
222+
help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
223+
)
224+
parser.add_argument(
225+
"--use-prefixes",
226+
action="store_true",
227+
help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
228+
)
229+
parser.add_argument(
230+
"out_table",
231+
help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
232+
)
233+
parser.add_argument(
234+
"in_sparql",
235+
help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
236+
)
237+
238+
parser_header_group = parser.add_mutually_exclusive_group(required=False)
239+
parser_header_group.add_argument(
240+
"--header",
241+
action="store_true",
242+
help="Print column labels. This is the default behavior.",
243+
)
244+
parser_header_group.add_argument(
245+
"--no-header",
246+
action="store_true",
247+
help="Do not print column labels.",
248+
)
249+
250+
parser_index_group = parser.add_mutually_exclusive_group(required=False)
251+
parser_index_group.add_argument(
252+
"--index",
253+
action="store_true",
254+
help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
255+
)
256+
parser_index_group.add_argument(
257+
"--no-index",
258+
action="store_true",
259+
help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
260+
)
261+
262+
parser.add_argument("in_graph", nargs="+")
263+
args = parser.parse_args()
264+
265+
output_mode: str
266+
if args.out_table.endswith(".csv"):
267+
output_mode = "csv"
268+
elif args.out_table.endswith(".html"):
269+
output_mode = "html"
270+
elif args.out_table.endswith(".json"):
271+
output_mode = "json"
272+
elif args.out_table.endswith(".md"):
273+
output_mode = "md"
274+
elif args.out_table.endswith(".tsv"):
275+
output_mode = "tsv"
276+
else:
277+
raise NotImplementedError("Output file extension not implemented.")
278+
279+
graph = rdflib.Graph()
280+
for in_graph_filename in args.in_graph:
281+
graph.parse(in_graph_filename)
282+
283+
select_query_text: typing.Optional[str] = None
284+
with open(args.in_sparql, "r") as in_fh:
285+
select_query_text = in_fh.read().strip()
286+
if select_query_text is None:
287+
raise ValueError("Failed to load query.")
288+
_logger.debug("select_query_text = %r." % select_query_text)
289+
290+
# Process --header and --no-header.
291+
use_header: bool
292+
if args.header is True:
293+
use_header = True
294+
if args.no_header is True:
295+
use_header = False
296+
else:
297+
use_header = True
298+
299+
# Process --index and --no-index.
300+
use_index: bool
301+
if args.index is True:
302+
use_index = True
303+
if args.no_index is True:
304+
use_index = False
305+
else:
306+
use_index = True
307+
308+
if (
309+
output_mode == "json"
310+
and use_index is False
311+
and args.json_orient not in {"split", "table"}
312+
):
313+
raise ValueError(
314+
"For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
175315
)
176316

317+
df = graph_and_query_to_data_frame(
318+
graph,
319+
select_query_text,
320+
built_version=args.built_version,
321+
disallow_empty_results=args.disallow_empty_results is True,
322+
use_prefixes=args.use_prefixes is True,
323+
)
324+
325+
table_text = data_frame_to_table_text(
326+
df,
327+
json_indent=args.json_indent,
328+
json_orient=args.json_orient,
329+
output_mode=output_mode,
330+
use_header=use_header,
331+
use_index=use_index,
332+
)
177333
with open(args.out_table, "w") as out_fh:
178334
out_fh.write(table_text)
179335
if table_text[-1] != "\n":

tests/case_utils/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ check: \
6565
&& pytest \
6666
--ignore case_file \
6767
--ignore case_sparql_construct \
68+
--ignore case_sparql_select \
6869
--ignore case_validate \
6970
--log-level=DEBUG
7071

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"?nFile":{
3+
"0":"kb:file-1",
4+
"1":"kb:file-2"
5+
}
6+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"?nFile":{"0":"kb:file-1","1":"kb:file-2"}}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"?name":{
3+
"0":"Johnny Lee Outlaw",
4+
"1":"Peter Goodguy"
5+
},
6+
"?mbox":{
7+
"0":"mailto:jlow@example.com",
8+
"1":"mailto:peter@example.org"
9+
}
10+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"?name":{"0":"Johnny Lee Outlaw","1":"Peter Goodguy"},"?mbox":{"0":"mailto:jlow@example.com","1":"mailto:peter@example.org"}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"0":{"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},"1":{"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},{"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}]
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"columns":["?name","?mbox"],"index":[0,1],"data":[["Johnny Lee Outlaw","mailto:jlow@example.com"],["Peter Goodguy","mailto:peter@example.org"]]}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"?name","type":"string"},{"name":"?mbox","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"?name":"Johnny Lee Outlaw","?mbox":"mailto:jlow@example.com"},{"index":1,"?name":"Peter Goodguy","?mbox":"mailto:peter@example.org"}]}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[["Johnny Lee Outlaw","mailto:jlow@example.com"],["Peter Goodguy","mailto:peter@example.org"]]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
,?name,?mbox
2+
0,Johnny Lee Outlaw,mailto:jlow@example.com
3+
1,Peter Goodguy,mailto:peter@example.org

0 commit comments

Comments
 (0)