Skip to content

Commit 6cbe7e4

Browse files
committed
Replace attribute invalid characters with _, vs stripping
Fixes #2143
1 parent 68f6f9c commit 6cbe7e4

File tree

5 files changed

+18
-10
lines changed

5 files changed

+18
-10
lines changed

CHANGES.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
* Updated the `button` tag configuration to include a space between multiple button elements in the `Element.text()`
2121
method. [2105](https://github.com/jhy/jsoup/issues/2105)
2222
* Added support for the `ns|*` all elements in namespace Selector. [1811](https://github.com/jhy/jsoup/issues/1811)
23+
* When normalising attribute names during serialization, invalid characters are now replaced with `_`, vs being
24+
stripped. This should make the process clearer, and generally prevent an invalid attribute name being coerced
25+
unexpectedly. [2143](https://github.com/jhy/jsoup/issues/2143)
2326

2427
### Changes
2528

src/main/java/org/jsoup/nodes/Attribute.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,18 +174,23 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
174174
}
175175

176176
private static final Pattern xmlKeyValid = Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");
177-
private static final Pattern xmlKeyReplace = Pattern.compile("[^-a-zA-Z0-9_:.]");
177+
private static final Pattern xmlKeyReplace = Pattern.compile("[^-a-zA-Z0-9_:.]+");
178178
private static final Pattern htmlKeyValid = Pattern.compile("[^\\x00-\\x1f\\x7f-\\x9f \"'/=]+");
179-
private static final Pattern htmlKeyReplace = Pattern.compile("[\\x00-\\x1f\\x7f-\\x9f \"'/=]");
179+
private static final Pattern htmlKeyReplace = Pattern.compile("[\\x00-\\x1f\\x7f-\\x9f \"'/=]+");
180180

181+
/**
182+
* Get a valid attribute key for the given syntax. If the key is not valid, an attempt is made to coerce it into a valid key; if that fails, null is returned.
183+
* @param key the original attribute key
184+
* @param syntax HTML or XML
185+
* @return the original key if it's valid; otherwise a key with each run of consecutive invalid characters replaced with a single "_"; or null if a valid key could not be created.
186+
*/
181187
@Nullable public static String getValidKey(String key, Syntax syntax) {
182-
// we consider HTML attributes to always be valid. XML checks key validity
183188
if (syntax == Syntax.xml && !xmlKeyValid.matcher(key).matches()) {
184-
key = xmlKeyReplace.matcher(key).replaceAll("");
189+
key = xmlKeyReplace.matcher(key).replaceAll("_");
185190
return xmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
186191
}
187192
else if (syntax == Syntax.html && !htmlKeyValid.matcher(key).matches()) {
188-
key = htmlKeyReplace.matcher(key).replaceAll("");
193+
key = htmlKeyReplace.matcher(key).replaceAll("_");
189194
return htmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
190195
}
191196
return key;

src/test/java/org/jsoup/helper/W3CDomTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ public void handlesInvalidAttributeNames() {
151151

152152
Document w3Doc = W3CDom.convert(jsoupDoc);
153153
String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
154-
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body name=\"\" style=\"color: red\"/></html>", xml);
154+
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body _=\"\" name_=\"\" style=\"color: red\"/></html>", xml);
155155
}
156156

157157
@Test
@@ -175,7 +175,7 @@ public void xmlInputDocMaintainsHtmlAttributeNames() {
175175

176176
Document w3Doc = W3CDom.convert(jsoupDoc);
177177
String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
178-
String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>";
178+
String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p h_nh=\"2\">unicode attr names coerced</p></body></html>";
179179
assertEquals(expected, TextUtil.stripNewlines(out));
180180
}
181181

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ private static Stream<Arguments> dupeAttributeData() {
8585
// NOTE: per spec this should be the test case. but impacts too many ppl
8686
// assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html());
8787

88-
assertEquals("<p a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
88+
assertEquals("<p _a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
8989

9090
doc = Jsoup.parse("<p .....>");
9191
assertEquals("<p .....></p>", doc.body().html());
@@ -1522,7 +1522,7 @@ private boolean didAddElements(String input) {
15221522
assertEquals(Document.OutputSettings.Syntax.html, doc.outputSettings().syntax());
15231523

15241524
String out = doc.body().outerHtml();
1525-
assertEquals("<body style=\"color: red\" name>\n <div></div>\n</body>", out);
1525+
assertEquals("<body style=\"color: red\" _ name_>\n <div _></div>\n</body>", out);
15261526
}
15271527

15281528
@Test void templateInHead() {

src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ public void handlesLTinScript() {
296296
assertEquals(Syntax.xml, doc.outputSettings().syntax());
297297

298298
String out = doc.html();
299-
assertEquals("<body style=\"color: red\" name=\"\"><div></div></body>", out);
299+
assertEquals("<body style=\"color: red\" _=\"\" name_=\"\"><div _=\"\"></div></body>", out);
300300
}
301301

302302
@Test void customTagsAreFlyweights() {

0 commit comments

Comments
 (0)