Skip to content

Commit e351116

Browse files
committed
Update script and example for unescaping HTML.
1 parent 8309bb1 commit e351116

File tree

2 files changed

+23
-41
lines changed

2 files changed

+23
-41
lines changed

common/extract-examples.rb

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
require 'fileutils'
1616
require 'colorize'
1717
require 'yaml'
18+
require 'cgi'
1819

1920
PREFIXES = {
2021
dc: "http://purl.org/dc/terms/",
@@ -49,8 +50,8 @@
4950
# Remove highlighting and commented out sections
5051
def justify(str)
5152
str = str.
52-
sub(/^\s*<!--\s*$/, '').
53-
sub(/^\s*-->\s*$/, '').
53+
gsub(/^\s*<!--\s*$/, '').
54+
gsub(/^\s*-->\s*$/, '').
5455
gsub('****', '').
5556
gsub(/####([^#]*)####/, '')
5657

@@ -222,7 +223,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
222223
examples[title] = {
223224
title: title,
224225
filename: fn,
225-
content: content,
226+
content: content.to_s.gsub(/^\s*< !\s*-\s*-/, '<!--').gsub(/-\s*- >/, '-->'),
226227
content_type: element.attr('data-content-type'),
227228
number: example_number,
228229
ext: ext,
@@ -302,6 +303,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
302303
# Perform example syntactic validation based on extension
303304
case ex[:ext]
304305
when 'json', 'jsonld', 'jsonldf'
306+
content = CGI.unescapeHTML(content)
305307
begin
306308
::JSON.parse(content)
307309
rescue JSON::ParserError => exception
@@ -325,22 +327,17 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
325327
ex[:base] = html_base.to_s if html_base
326328

327329
script_content = doc.at_xpath(xpath)
328-
if script_content
329-
# Remove (faked) XML comments and unescape sequences
330-
content = script_content
331-
.inner_html
332-
.sub(/^\s*< !\s*-\s*-/, '')
333-
.sub(/-\s*- >\s*$/, '')
334-
.gsub(/&lt;/, '<')
335-
end
336-
330+
331+
# Unescape HTML entities in the extracted script content
332+
content = CGI.unescapeHTML(script_content.inner_html) if script_content
337333
rescue Nokogiri::XML::SyntaxError => exception
338334
errors << "Example #{ex[:number]} at line #{ex[:line]} parse error: #{exception.message}"
339335
$stdout.write "F".colorize(:red)
340336
next
341337
end
342338
when 'table'
343-
# already in parsed form
339+
doc = Nokogiri::HTML.parse(content) {|c| c.strict}
340+
content = CGI.unescapeHTML(doc.inner_html)
344341
when 'ttl', 'trig'
345342
begin
346343
reader_errors = []
@@ -443,10 +440,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
443440
# Set argument to referenced content to be parsed
444441
args[0] = if examples[ex[:result_for]][:ext] == 'html' && method == :expand
445442
# If we are expanding, and the reference is HTML, find the first script element.
446-
doc = Nokogiri::HTML.parse(
447-
examples[ex[:result_for]][:content]
448-
.sub(/^\s*< !\s*-\s*-/, '')
449-
.sub(/-\s*- >\s*$/, ''))
443+
doc = Nokogiri::HTML.parse(examples[ex[:result_for]][:content])
450444

451445
# Get base from document, if present
452446
html_base = doc.at_xpath('/html/head/base/@href')
@@ -458,15 +452,10 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
458452
$stdout.write "F".colorize(:red)
459453
next
460454
end
461-
StringIO.new(script_content
462-
.inner_html
463-
.gsub(/&lt;/, '<'))
455+
StringIO.new(CGI.unescapeHTML(script_content.inner_html))
464456
elsif examples[ex[:result_for]][:ext] == 'html' && ex[:target]
465457
# Only use the targeted script
466-
doc = Nokogiri::HTML.parse(
467-
examples[ex[:result_for]][:content]
468-
.sub(/^\s*< !\s*-\s*-/, '')
469-
.sub(/-\s*- >\s*$/, ''))
458+
doc = Nokogiri::HTML.parse(examples[ex[:result_for]][:content])
470459
script_content = doc.at_xpath(xpath)
471460
unless script_content
472461
errors << "Example #{ex[:number]} at line #{ex[:line]} references example #{ex[:result_for].inspect} with no JSON-LD script element"
@@ -565,7 +554,8 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:)
565554
$stderr.puts "expected:\n" + expected.to_trig if verbose
566555
when 'table'
567556
expected = begin
568-
table_to_dataset(content)
557+
doc = Nokogiri::HTML.parse(content)
558+
table_to_dataset(doc)
569559
rescue
570560
errors << "Example #{ex[:number]} at line #{ex[:line]} raised error reading table: #{$!}"
571561
RDF::Dataset.new

index.html

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8702,7 +8702,6 @@ <h3>Graph Containers</h3>
87028702
<div class="selectors">
87038703
<button class="selected" data-selects="original">Original</button>
87048704
<button data-selects="expanded">Expanded</button>
8705-
<button data-selects="statements">Statements</button>
87068705
<button data-selects="turtle">Turtle</button>
87078706
</div>
87088707
<pre class="original selected" data-transform="updateExample"
@@ -8720,36 +8719,29 @@ <h3>Graph Containers</h3>
87208719
</pre>
87218720
<pre class="expanded"
87228721
data-transform="updateExample"
8723-
data-result-for="Embedding JSON-LD in HTML with comments-original">
8722+
data-result-for="Embedding JSON-LD containing HTML in HTML-original">
87248723
<!--
87258724
[{
8726-
"@type": "http://schema.org/WebPageElement",
8725+
"@type": ["http://schema.org/WebPageElement"],
87278726
"http://schema.org/name": [{"@value": "Encoding Issues"}],
87288727
"http://schema.org/description": [
8729-
{"@value": "Issues list such as unescaped &lt;/script&gt; or --&gt;"}
8728+
{"@value": "Issues list such as unescaped </script> or -- >"}
87308729
]
87318730
}]
87328731
-->
87338732
</pre>
8734-
<table class="statements"
8735-
data-result-for="Embedding JSON-LD in HTML with comments-expanded"
8736-
data-to-rdf>
8737-
<thead><tr><th>Subject</th><th>Property</th><th>Value</th><th>Value Type</th></tr></thead>
8738-
<tbody>
8739-
<tr><td>_:b0</td><td>schema:name</td><td>Encoding Issues</td><td>&nbsp;</td></tr>
8740-
<tr><td>_:b0</td><td>schema:description</td><td>Issues list such as unescaped &amp;lt;/script&amp;gt; or --&amp;gt;</td><td>&nbsp;</td></tr>
8741-
</tbody>
8742-
</table>
87438733
<pre class="turtle"
87448734
data-content-type="text/turtle"
87458735
data-transform="updateExample"
8746-
data-result-for="Embedding JSON-LD in HTML with comments-expanded"
8736+
data-result-for="Embedding JSON-LD containing HTML in HTML-expanded"
87478737
data-to-rdf>
87488738
<!--
87498739
@prefix schema: <http://schema.org/> .
87508740
8751-
<> schema:name "Encoding Issues";
8752-
schema:description "Issues list such as unescaped &lt;/script&gt; or --&gt;" .
8741+
[ a schema:WebPageElement;
8742+
schema:name "Encoding Issues";
8743+
schema:description "Issues list such as unescaped </script> or -- >"
8744+
] .
87538745
-->
87548746
</pre>
87558747
</aside>

0 commit comments

Comments
 (0)