Skip to content

Commit 7565c56

Browse files
committed
#57 Uncaught IllegalArgumentException due to malformed unicode entity ref
- Added a more readable error message by means of an XmlPullParserException.
- Improved validation of the numeric character reference, according to the XML 1.0 spec (https://www.w3.org/TR/REC-xml/#NT-Char).
1 parent c3e8d88 commit 7565c56

File tree

2 files changed

+79
-10
lines changed

2 files changed

+79
-10
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+37-10
Original file line number | Diff line number | Diff line change
@@ -2664,7 +2664,8 @@ protected char[] parseEntityRef()
26642664
entityRefName = null;
26652665
posStart = pos;
26662666
char ch = more();
2667-
StringBuilder sb = new StringBuilder();
2667+
StringBuilder sb16 = new StringBuilder();
2668+
StringBuilder sb10 = new StringBuilder();
26682669
if ( ch == '#' )
26692670
{
26702671
// parse character reference
@@ -2679,17 +2680,17 @@ protected char[] parseEntityRef()
26792680
if ( ch >= '0' && ch <= '9' )
26802681
{
26812682
charRef = (char) ( charRef * 16 + ( ch - '0' ) );
2682-
sb.append( ch );
2683+
sb16.append( ch );
26832684
}
26842685
else if ( ch >= 'a' && ch <= 'f' )
26852686
{
26862687
charRef = (char) ( charRef * 16 + ( ch - ( 'a' - 10 ) ) );
2687-
sb.append( ch );
2688+
sb16.append( ch );
26882689
}
26892690
else if ( ch >= 'A' && ch <= 'F' )
26902691
{
26912692
charRef = (char) ( charRef * 16 + ( ch - ( 'A' - 10 ) ) );
2692-
sb.append( ch );
2693+
sb16.append( ch );
26932694
}
26942695
else if ( ch == ';' )
26952696
{
@@ -2710,6 +2711,7 @@ else if ( ch >= 'A' && ch <= 'F' )
27102711
if ( ch >= '0' && ch <= '9' )
27112712
{
27122713
charRef = (char) ( charRef * 10 + ( ch - '0' ) );
2714+
sb10.append( ch );
27132715
}
27142716
else if ( ch == ';' )
27152717
{
@@ -2724,16 +2726,35 @@ else if ( ch >= 'A' && ch <= 'F' )
27242726
}
27252727
}
27262728
posEnd = pos - 1;
2727-
if ( sb.length() > 0 )
2729+
if ( sb16.length() > 0 )
27282730
{
2729-
char[] tmp = toChars( Integer.parseInt( sb.toString(), 16 ) );
2730-
charRefOneCharBuf = tmp;
2731+
try
2732+
{
2733+
charRefOneCharBuf = toChars( Integer.parseInt( sb16.toString(), 16 ) );
2734+
}
2735+
catch ( IllegalArgumentException e )
2736+
{
2737+
throw new XmlPullParserException( "character reference (with hex value " + sb16.toString()
2738+
+ ") is invalid", this, null );
2739+
}
2740+
27312741
if ( tokenize )
27322742
{
27332743
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
27342744
}
27352745
return charRefOneCharBuf;
27362746
}
2747+
2748+
try
2749+
{
2750+
toChars( Integer.parseInt( sb10.toString(), 10 ) );
2751+
}
2752+
catch ( IllegalArgumentException e )
2753+
{
2754+
throw new XmlPullParserException( "character reference (with decimal value " + sb10.toString()
2755+
+ ") is invalid", this, null );
2756+
}
2757+
27372758
charRefOneCharBuf[0] = charRef;
27382759
if ( tokenize )
27392760
{
@@ -3996,15 +4017,21 @@ private static boolean isHighSurrogate( char ch )
39964017
return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
39974018
}
39984019

3999-
private static final int MIN_CODE_POINT = 0x000000;
4000-
40014020
private static final int MAX_CODE_POINT = 0x10FFFF;
40024021

40034022
private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
40044023

4024+
/**
4025+
* Check if the provided parameter is a valid Char, according to: {@link https://www.w3.org/TR/REC-xml/#NT-Char}
4026+
*
4027+
* @param codePoint the numeric value to check
4028+
* @return true if it is a valid numeric character reference. False otherwise.
4029+
*/
40054030
private static boolean isValidCodePoint( int codePoint )
40064031
{
4007-
return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
4032+
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
4033+
return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || ( 0x20 <= codePoint && codePoint <= 0xD7FF )
4034+
|| ( 0xE000 <= codePoint && codePoint <= 0xFFFD ) || ( 0x10000 <= codePoint && codePoint <= 0X10FFFF );
40084035
}
40094036

40104037
private static boolean isSupplementaryCodePoint( int codePoint )

src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+42
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717
*/
1818

1919
import static org.junit.Assert.assertEquals;
20+
import static org.junit.Assert.assertTrue;
21+
import static org.junit.Assert.fail;
2022

2123
import java.io.IOException;
2224
import java.io.StringReader;
@@ -156,6 +158,46 @@ public void testUnicodeEntities()
156158
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
157159
}
158160

161+
@Test
162+
public void testInvalidCharacterReferenceHexa()
163+
throws Exception
164+
{
165+
MXParser parser = new MXParser();
166+
String input = "<root>&#x110000;</root>";
167+
parser.setInput( new StringReader( input ) );
168+
169+
try
170+
{
171+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
172+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
173+
fail( "Should fail since &#x110000; is an illegal character reference" );
174+
}
175+
catch ( XmlPullParserException e )
176+
{
177+
assertTrue( e.getMessage().contains( "character reference (with hex value 110000) is invalid" ) );
178+
}
179+
}
180+
181+
@Test
182+
public void testInvalidCharacterReferenceDecimal()
183+
throws Exception
184+
{
185+
MXParser parser = new MXParser();
186+
String input = "<root>&#1114112;</root>";
187+
parser.setInput( new StringReader( input ) );
188+
189+
try
190+
{
191+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
192+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
193+
fail( "Should fail since &#1114112; is an illegal character reference" );
194+
}
195+
catch ( XmlPullParserException e )
196+
{
197+
assertTrue( e.getMessage().contains( "character reference (with decimal value 1114112) is invalid" ) );
198+
}
199+
}
200+
159201
@Test
160202
public void testProcessingInstruction()
161203
throws Exception

0 commit comments

Comments
 (0)