49
49
_logger = logging .getLogger (os .path .basename (__file__ ))
50
50
51
51
52
- def main () -> None :
53
- parser = argparse .ArgumentParser ()
54
-
55
- # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
56
- logging .basicConfig (
57
- level = logging .DEBUG
58
- if ("--debug" in sys .argv or "-d" in sys .argv )
59
- else logging .INFO
60
- )
61
-
62
- parser .add_argument ("-d" , "--debug" , action = "store_true" )
63
- parser .add_argument (
64
- "--built-version" ,
65
- choices = tuple (built_version_choices_list ),
66
- default = "case-" + CURRENT_CASE_VERSION ,
67
- help = "Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis." ,
68
- )
69
- parser .add_argument (
70
- "--disallow-empty-results" ,
71
- action = "store_true" ,
72
- help = "Raise error if no results are returned for query." ,
73
- )
74
- parser .add_argument (
75
- "--use-prefixes" ,
76
- action = "store_true" ,
77
- help = "Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)" ,
78
- )
79
- parser .add_argument (
80
- "out_table" ,
81
- help = "Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values." ,
82
- )
83
- parser .add_argument (
84
- "in_sparql" ,
85
- help = "File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs." ,
86
- )
87
- parser .add_argument ("in_graph" , nargs = "+" )
88
- args = parser .parse_args ()
52
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
    """
    Extract the result-variable names from a SPARQL SELECT query's text.

    :param select_query_text: Text of a SPARQL SELECT query.  The query is
        expected to contain a line that starts with "SELECT " -- this is a
        line-based scan, not a full SPARQL parse.
    :returns: List of variable names (with their leading "?"), in the order
        they appear on the SELECT line.
    :raises ValueError: If no line of the query starts with "SELECT ".
    """
    # Build columns list from SELECT line.
    select_query_text_lines = select_query_text.split("\n")
    select_lines = [
        line for line in select_query_text_lines if line.startswith("SELECT ")
    ]
    if len(select_lines) == 0:
        # Formerly surfaced as an IndexError on the [0] subscript; raise a
        # clearer error for malformed query text.
        raise ValueError("Query text does not contain a line starting with 'SELECT '.")
    select_line = select_lines[0]
    # Strip the optional DISTINCT modifier and the SELECT keyword, then split
    # on spaces.  Filtering empty strings guards against runs of multiple
    # spaces (or a trailing space) on the SELECT line producing bogus
    # empty-named columns.
    variables = [
        variable
        for variable in select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
        if variable != ""
    ]
    return variables
89
60
90
- graph = rdflib .Graph ()
91
- for in_graph_filename in args .in_graph :
92
- graph .parse (in_graph_filename )
93
61
62
+ def graph_and_query_to_data_frame (
63
+ graph : rdflib .Graph ,
64
+ select_query_text : str ,
65
+ * args : typing .Any ,
66
+ built_version : str = "case-" + CURRENT_CASE_VERSION ,
67
+ disallow_empty_results : bool = False ,
68
+ use_prefixes : bool = False ,
69
+ ** kwargs : typing .Any ,
70
+ ) -> pd .DataFrame :
94
71
# Inherit prefixes defined in input context dictionary.
95
72
nsdict = {k : v for (k , v ) in graph .namespace_manager .namespaces ()}
96
73
97
- select_query_text = None
98
- with open (args .in_sparql , "r" ) as in_fh :
99
- select_query_text = in_fh .read ().strip ()
100
- _logger .debug ("select_query_text = %r." % select_query_text )
101
-
74
+ # Avoid side-effects on input parameter.
102
75
if "subClassOf" in select_query_text :
103
- case_utils .ontology .load_subclass_hierarchy (
104
- graph , built_version = args .built_version
105
- )
76
+ _graph = rdflib .Graph ()
77
+ _graph += graph
78
+ case_utils .ontology .load_subclass_hierarchy (_graph , built_version = built_version )
79
+ else :
80
+ _graph = graph
106
81
107
- # Build columns list from SELECT line.
108
- select_query_text_lines = select_query_text .split ("\n " )
109
- select_line = [
110
- line for line in select_query_text_lines if line .startswith ("SELECT " )
111
- ][0 ]
112
- variables = select_line .replace (" DISTINCT" , "" ).replace ("SELECT " , "" ).split (" " )
82
+ variables = query_text_to_variables (select_query_text )
113
83
114
84
tally = 0
115
85
records = []
116
86
select_query_object = rdflib .plugins .sparql .processor .prepareQuery (
117
87
select_query_text , initNs = nsdict
118
88
)
119
- for (row_no , row ) in enumerate (graph .query (select_query_object )):
89
+ for (row_no , row ) in enumerate (_graph .query (select_query_object )):
120
90
tally = row_no + 1
121
91
record = []
122
92
for (column_no , column ) in enumerate (row ):
@@ -131,7 +101,7 @@ def main() -> None:
131
101
# .decode() is because hexlify returns bytes.
132
102
column_value = binascii .hexlify (column .toPython ()).decode ()
133
103
elif isinstance (column , rdflib .URIRef ):
134
- if args . use_prefixes :
104
+ if use_prefixes :
135
105
column_value = graph .namespace_manager .qname (column .toPython ())
136
106
else :
137
107
column_value = column .toPython ()
@@ -141,39 +111,225 @@ def main() -> None:
141
111
_logger .debug ("row[0]column[%d] = %r." % (column_no , column_value ))
142
112
record .append (column_value )
143
113
records .append (record )
114
+
144
115
if tally == 0 :
145
- if args . disallow_empty_results :
116
+ if disallow_empty_results :
146
117
raise ValueError ("Failed to return any results." )
147
118
148
119
df = pd .DataFrame (records , columns = variables )
120
+ return df
149
121
122
+
123
def data_frame_to_table_text(
    df: pd.DataFrame,
    *args: typing.Any,
    json_indent: typing.Optional[int] = None,
    json_orient: str,
    output_mode: str,
    use_header: bool,
    use_index: bool,
    **kwargs: typing.Any,
) -> str:
    """
    Render a Pandas DataFrame as text in one of several table formats.

    :param df: DataFrame to render.
    :param json_indent: Number of whitespace characters to use for
        indentation.  Only applicable when output_mode is "json".
    :param json_orient: Pandas DataFrame.to_json "orient" value.  Only
        applicable when output_mode is "json".
    :param output_mode: One of "csv", "html", "json", "md", or "tsv".
    :param use_header: Whether to print column labels.
    :param use_index: Whether to print the index (auto-incrementing row
        labels) column.
    :returns: Rendered table text.
    :raises NotImplementedError: If output_mode is not one of the modes
        listed above.
    """
    # Set up kwargs dicts.  One kwarg behaves slightly differently for
    # Markdown vs. other formats.
    general_kwargs: typing.Dict[str, typing.Any] = dict()
    md_kwargs: typing.Dict[str, typing.Any] = dict()

    # Note some output modes will drop 'header' from general_kwargs, due to
    # alternate support or lack of support.
    general_kwargs["header"] = use_header
    if not use_header:
        # to_markdown forwards column labels via tabulate's 'headers' kwarg;
        # an empty tuple suppresses them.
        md_kwargs["headers"] = tuple()

    general_kwargs["index"] = use_index

    table_text: str
    if output_mode in {"csv", "tsv"}:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
        # The membership guard above makes any third case unreachable, so the
        # separator reduces to a two-way choice.  (A previous revision carried
        # a dead NotImplementedError branch here.)
        sep: str = "," if output_mode == "csv" else "\t"
        table_text = df.to_csv(sep=sep, **general_kwargs)
    elif output_mode == "html":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(
            classes=("table", "table-bordered", "table-condensed"), **general_kwargs
        )
    elif output_mode == "json":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_json(
            indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
        )
    elif output_mode == "md":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
    else:
        # Previously this was guarded by an always-true 'if table_text is
        # None' check (table_text could not have been assigned on this path).
        raise NotImplementedError("Unimplemented output mode: %r." % output_mode)

    return table_text
189
+
190
+
191
+ def main () -> None :
192
+ parser = argparse .ArgumentParser ()
193
+
194
+ # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
195
+ logging .basicConfig (
196
+ level = logging .DEBUG
197
+ if ("--debug" in sys .argv or "-d" in sys .argv )
198
+ else logging .INFO
199
+ )
200
+
201
+ parser .add_argument ("-d" , "--debug" , action = "store_true" )
202
+ parser .add_argument (
203
+ "--built-version" ,
204
+ choices = tuple (built_version_choices_list ),
205
+ default = "case-" + CURRENT_CASE_VERSION ,
206
+ help = "Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis." ,
207
+ )
208
+ parser .add_argument (
209
+ "--disallow-empty-results" ,
210
+ action = "store_true" ,
211
+ help = "Raise error if no results are returned for query." ,
212
+ )
213
+ parser .add_argument (
214
+ "--json-indent" ,
215
+ type = int ,
216
+ help = "Number of whitespace characters to use for indentation. Only applicable for JSON output." ,
217
+ )
218
+ parser .add_argument (
219
+ "--json-orient" ,
220
+ default = "columns" ,
221
+ choices = ("columns" , "index" , "records" , "split" , "table" , "values" ),
222
+ help = "Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output." ,
223
+ )
224
+ parser .add_argument (
225
+ "--use-prefixes" ,
226
+ action = "store_true" ,
227
+ help = "Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)" ,
228
+ )
229
+ parser .add_argument (
230
+ "out_table" ,
231
+ help = "Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD." ,
232
+ )
233
+ parser .add_argument (
234
+ "in_sparql" ,
235
+ help = "File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs." ,
236
+ )
237
+
238
+ parser_header_group = parser .add_mutually_exclusive_group (required = False )
239
+ parser_header_group .add_argument (
240
+ "--header" ,
241
+ action = "store_true" ,
242
+ help = "Print column labels. This is the default behavior." ,
243
+ )
244
+ parser_header_group .add_argument (
245
+ "--no-header" ,
246
+ action = "store_true" ,
247
+ help = "Do not print column labels." ,
248
+ )
249
+
250
+ parser_index_group = parser .add_mutually_exclusive_group (required = False )
251
+ parser_index_group .add_argument (
252
+ "--index" ,
253
+ action = "store_true" ,
254
+ help = "Print index (auto-incrementing row labels as left untitled column). This is the default behavior." ,
255
+ )
256
+ parser_index_group .add_argument (
257
+ "--no-index" ,
258
+ action = "store_true" ,
259
+ help = "Do not print index. If output is JSON, --json-orient must be 'split' or 'table'." ,
260
+ )
261
+
262
+ parser .add_argument ("in_graph" , nargs = "+" )
263
+ args = parser .parse_args ()
264
+
265
+ output_mode : str
266
+ if args .out_table .endswith (".csv" ):
267
+ output_mode = "csv"
268
+ elif args .out_table .endswith (".html" ):
269
+ output_mode = "html"
270
+ elif args .out_table .endswith (".json" ):
271
+ output_mode = "json"
272
+ elif args .out_table .endswith (".md" ):
273
+ output_mode = "md"
274
+ elif args .out_table .endswith (".tsv" ):
275
+ output_mode = "tsv"
276
+ else :
277
+ raise NotImplementedError ("Output file extension not implemented." )
278
+
279
+ graph = rdflib .Graph ()
280
+ for in_graph_filename in args .in_graph :
281
+ graph .parse (in_graph_filename )
282
+
283
+ select_query_text : typing .Optional [str ] = None
284
+ with open (args .in_sparql , "r" ) as in_fh :
285
+ select_query_text = in_fh .read ().strip ()
286
+ if select_query_text is None :
287
+ raise ValueError ("Failed to load query." )
288
+ _logger .debug ("select_query_text = %r." % select_query_text )
289
+
290
+ # Process --header and --no-header.
291
+ use_header : bool
292
+ if args .header is True :
293
+ use_header = True
294
+ if args .no_header is True :
295
+ use_header = False
296
+ else :
297
+ use_header = True
298
+
299
+ # Process --index and --no-index.
300
+ use_index : bool
301
+ if args .index is True :
302
+ use_index = True
303
+ if args .no_index is True :
304
+ use_index = False
305
+ else :
306
+ use_index = True
307
+
308
+ if (
309
+ output_mode == "json"
310
+ and use_index is False
311
+ and args .json_orient not in {"split" , "table" }
312
+ ):
313
+ raise ValueError (
314
+ "For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
175
315
)
176
316
317
+ df = graph_and_query_to_data_frame (
318
+ graph ,
319
+ select_query_text ,
320
+ built_version = args .built_version ,
321
+ disallow_empty_results = args .disallow_empty_results is True ,
322
+ use_prefixes = args .use_prefixes is True ,
323
+ )
324
+
325
+ table_text = data_frame_to_table_text (
326
+ df ,
327
+ json_indent = args .json_indent ,
328
+ json_orient = args .json_orient ,
329
+ output_mode = output_mode ,
330
+ use_header = use_header ,
331
+ use_index = use_index ,
332
+ )
177
333
with open (args .out_table , "w" ) as out_fh :
178
334
out_fh .write (table_text )
179
335
if table_text [- 1 ] != "\n " :
0 commit comments