26
26
Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
27
27
"""
28
28
29
- __version__ = "0.4.4 "
29
+ __version__ = "0.5.0 "
30
30
31
31
import argparse
32
32
import binascii
33
33
import logging
34
34
import os
35
35
import sys
36
+ import typing
36
37
37
38
import pandas as pd # type: ignore
38
39
import rdflib .plugins .sparql
48
49
# Module-level logger named after this script's file name, so log lines
# identify their source when this tool runs from the command line.
_logger = logging.getLogger(os.path.basename(__file__))
49
50
50
51
51
- def main () -> None :
52
- parser = argparse .ArgumentParser ()
53
-
54
- # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
55
- logging .basicConfig (
56
- level = logging .DEBUG
57
- if ("--debug" in sys .argv or "-d" in sys .argv )
58
- else logging .INFO
59
- )
60
-
61
- parser .add_argument ("-d" , "--debug" , action = "store_true" )
62
- parser .add_argument (
63
- "--built-version" ,
64
- choices = tuple (built_version_choices_list ),
65
- default = "case-" + CURRENT_CASE_VERSION ,
66
- help = "Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis." ,
67
- )
68
- parser .add_argument (
69
- "--disallow-empty-results" ,
70
- action = "store_true" ,
71
- help = "Raise error if no results are returned for query." ,
72
- )
73
- parser .add_argument (
74
- "out_table" ,
75
- help = "Expected extensions are .html for HTML tables or .md for Markdown tables." ,
76
- )
77
- parser .add_argument (
78
- "in_sparql" ,
79
- help = "File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs." ,
80
- )
81
- parser .add_argument ("in_graph" , nargs = "+" )
82
- args = parser .parse_args ()
52
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
    """
    Extract the projected variable names from the SELECT line of a SPARQL
    SELECT query.

    :param select_query_text: Text of a SPARQL SELECT query.  The projection
      is expected on a single line beginning with "SELECT " (a wrapping outer
      SELECT can be used for more complex queries).
    :returns: Variable names in projection order, e.g. ["?s", "?p", "?o"].
    :raises ValueError: If no line of the query starts with "SELECT ".
    """
    # Build columns list from SELECT line.
    select_lines = [
        line
        for line in select_query_text.split("\n")
        if line.startswith("SELECT ")
    ]
    if not select_lines:
        # Raise a clear error instead of the opaque IndexError that indexing
        # an empty list would produce.
        raise ValueError("Query text does not contain a line starting with 'SELECT '.")
    select_line = select_lines[0]
    # Whitespace-split (no argument) tolerates runs of spaces between
    # variables, where split(" ") would yield empty-string entries that later
    # break DataFrame column construction.
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split()
    return variables
83
60
84
- graph = rdflib .Graph ()
85
- for in_graph_filename in args .in_graph :
86
- graph .parse (in_graph_filename )
87
61
62
+ def graph_and_query_to_data_frame (
63
+ graph : rdflib .Graph ,
64
+ select_query_text : str ,
65
+ * args : typing .Any ,
66
+ built_version : str = "case-" + CURRENT_CASE_VERSION ,
67
+ disallow_empty_results : bool = False ,
68
+ use_prefixes : bool = False ,
69
+ ** kwargs : typing .Any ,
70
+ ) -> pd .DataFrame :
88
71
# Inherit prefixes defined in input context dictionary.
89
72
nsdict = {k : v for (k , v ) in graph .namespace_manager .namespaces ()}
90
73
91
- select_query_text = None
92
- with open (args .in_sparql , "r" ) as in_fh :
93
- select_query_text = in_fh .read ().strip ()
94
- _logger .debug ("select_query_text = %r." % select_query_text )
95
-
74
+ # Avoid side-effects on input parameter.
96
75
if "subClassOf" in select_query_text :
97
- case_utils .ontology .load_subclass_hierarchy (
98
- graph , built_version = args .built_version
99
- )
76
+ _graph = rdflib .Graph ()
77
+ _graph += graph
78
+ case_utils .ontology .load_subclass_hierarchy (_graph , built_version = built_version )
79
+ else :
80
+ _graph = graph
100
81
101
- # Build columns list from SELECT line.
102
- select_query_text_lines = select_query_text .split ("\n " )
103
- select_line = [
104
- line for line in select_query_text_lines if line .startswith ("SELECT " )
105
- ][0 ]
106
- variables = select_line .replace (" DISTINCT" , "" ).replace ("SELECT " , "" ).split (" " )
82
+ variables = query_text_to_variables (select_query_text )
107
83
108
84
tally = 0
109
85
records = []
110
86
select_query_object = rdflib .plugins .sparql .processor .prepareQuery (
111
87
select_query_text , initNs = nsdict
112
88
)
113
- for (row_no , row ) in enumerate (graph .query (select_query_object )):
89
+ for (row_no , row ) in enumerate (_graph .query (select_query_object )):
114
90
tally = row_no + 1
115
91
record = []
116
92
for (column_no , column ) in enumerate (row ):
@@ -124,35 +100,241 @@ def main() -> None:
124
100
# The render to ASCII is in support of this script rendering results for website viewing.
125
101
# .decode() is because hexlify returns bytes.
126
102
column_value = binascii .hexlify (column .toPython ()).decode ()
103
+ elif isinstance (column , rdflib .URIRef ):
104
+ if use_prefixes :
105
+ column_value = graph .namespace_manager .qname (column .toPython ())
106
+ else :
107
+ column_value = column .toPython ()
127
108
else :
128
109
column_value = column .toPython ()
129
110
if row_no == 0 :
130
111
_logger .debug ("row[0]column[%d] = %r." % (column_no , column_value ))
131
112
record .append (column_value )
132
113
records .append (record )
114
+
133
115
if tally == 0 :
134
- if args . disallow_empty_results :
116
+ if disallow_empty_results :
135
117
raise ValueError ("Failed to return any results." )
136
118
137
119
df = pd .DataFrame (records , columns = variables )
120
+ return df
121
+
122
+
123
def data_frame_to_table_text(
    df: pd.DataFrame,
    *args: typing.Any,
    json_indent: typing.Optional[int] = None,
    json_orient: str,
    output_mode: str,
    use_header: bool,
    use_index: bool,
    **kwargs: typing.Any,
) -> str:
    """
    Render a Pandas DataFrame as table text in one of several formats.

    :param df: DataFrame to render.
    :param json_indent: Number of whitespace characters to use for
      indentation.  Only applicable when output_mode is "json".
    :param json_orient: DataFrame.to_json 'orient' value.  Only applicable
      when output_mode is "json".
    :param output_mode: One of "csv", "html", "json", "md", "tsv".
    :param use_header: Whether to include column labels.  Not supported by
      Pandas JSON output, where the flag is effectively ignored.
    :param use_index: Whether to include the row index.
    :returns: The rendered table as a string.
    :raises NotImplementedError: If output_mode is not a recognized mode.
    """
    # Set up kwargs dicts.  One kwarg behaves slightly differently for
    # Markdown vs. other formats.
    general_kwargs: typing.Dict[str, typing.Any] = dict()
    md_kwargs: typing.Dict[str, typing.Any] = dict()

    # Note some output modes will drop 'header' from general_kwargs, due to
    # alternate support or lack of support.
    if use_header:
        general_kwargs["header"] = True
    else:
        general_kwargs["header"] = False
        # tabulate suppresses headers via an empty 'headers' sequence rather
        # than a boolean flag.
        md_kwargs["headers"] = tuple()

    general_kwargs["index"] = use_index

    table_text: str
    if output_mode in {"csv", "tsv"}:
        # The guarding set membership makes the two-way choice exhaustive;
        # no unreachable fallback branch is needed here.
        sep: str = "," if output_mode == "csv" else "\t"
        table_text = df.to_csv(sep=sep, **general_kwargs)
    elif output_mode == "html":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(
            classes=("table", "table-bordered", "table-condensed"), **general_kwargs
        )
    elif output_mode == "json":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_json(
            indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
        )
    elif output_mode == "md":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
    else:
        # Raising directly replaces the previous dead 'if table_text is None'
        # nesting (always true here) and the trailing assert.
        raise NotImplementedError("Unimplemented output mode: %r." % output_mode)

    return table_text
189
+
190
+
191
def main() -> None:
    """
    Command-line entry point: run a SPARQL SELECT query against one or more
    input graphs and write the results to a table file, with the format
    chosen by the output file's extension.
    """
    parser = argparse.ArgumentParser()

    # Configure debug logging before running parse_args, because there could
    # be an error raised before the construction of the argument parser.
    logging.basicConfig(
        level=logging.DEBUG
        if ("--debug" in sys.argv or "-d" in sys.argv)
        else logging.INFO
    )

    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument(
        "--built-version",
        choices=tuple(built_version_choices_list),
        default="case-" + CURRENT_CASE_VERSION,
        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
    )
    parser.add_argument(
        "--disallow-empty-results",
        action="store_true",
        help="Raise error if no results are returned for query.",
    )
    parser.add_argument(
        "--json-indent",
        type=int,
        help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
    )
    parser.add_argument(
        "--json-orient",
        default="columns",
        choices=("columns", "index", "records", "split", "table", "values"),
        help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
    )
    parser.add_argument(
        "--use-prefixes",
        action="store_true",
        help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
    )
    parser.add_argument(
        "out_table",
        help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
    )
    parser.add_argument(
        "in_sparql",
        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
    )

    parser_header_group = parser.add_mutually_exclusive_group(required=False)
    parser_header_group.add_argument(
        "--header",
        action="store_true",
        help="Print column labels. This is the default behavior.",
    )
    parser_header_group.add_argument(
        "--no-header",
        action="store_true",
        help="Do not print column labels.",
    )

    parser_index_group = parser.add_mutually_exclusive_group(required=False)
    parser_index_group.add_argument(
        "--index",
        action="store_true",
        help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
    )
    parser_index_group.add_argument(
        "--no-index",
        action="store_true",
        help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
    )

    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    # Map the output file extension to a rendering mode.
    output_mode: typing.Optional[str] = None
    for extension, mode in (
        (".csv", "csv"),
        (".html", "html"),
        (".json", "json"),
        (".md", "md"),
        (".tsv", "tsv"),
    ):
        if args.out_table.endswith(extension):
            output_mode = mode
            break
    if output_mode is None:
        raise NotImplementedError("Output file extension not implemented.")

    graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        graph.parse(in_graph_filename)

    # .read() always returns a str, so no post-read None check is needed.
    with open(args.in_sparql, "r") as in_fh:
        select_query_text = in_fh.read().strip()
    _logger.debug("select_query_text = %r." % select_query_text)

    # Process --header and --no-header.  The flags are mutually exclusive
    # (enforced by argparse above); printing the header is the default.
    use_header: bool = not (args.no_header is True)

    # Process --index and --no-index.  The flags are mutually exclusive
    # (enforced by argparse above); printing the index is the default.
    use_index: bool = not (args.no_index is True)

    if (
        output_mode == "json"
        and use_index is False
        and args.json_orient not in {"split", "table"}
    ):
        raise ValueError(
            "For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
        )

    df = graph_and_query_to_data_frame(
        graph,
        select_query_text,
        built_version=args.built_version,
        disallow_empty_results=args.disallow_empty_results is True,
        use_prefixes=args.use_prefixes is True,
    )

    table_text = data_frame_to_table_text(
        df,
        json_indent=args.json_indent,
        json_orient=args.json_orient,
        output_mode=output_mode,
        use_header=use_header,
        use_index=use_index,
    )
    with open(args.out_table, "w") as out_fh:
        out_fh.write(table_text)
        if not table_text.endswith("\n"):
            # End file with newline.  CSV and TSV modes end with a built-in
            # newline.  Using .endswith avoids an IndexError that indexing
            # [-1] would raise on an empty table_text.
            out_fh.write("\n")
156
338
157
339
158
340
if __name__ == "__main__" :
0 commit comments