Skip to content

Commit 1dafbae

Browse files
belingueres authored and hboutemy committed
#57 Uncaught IllegalArgumentException due to malformed unicode entity ref
- Added a more readable error message by throwing an XmlPullParserException. - Improved validation of the numeric character reference, according to the XML 1.0 spec (https://www.w3.org/TR/REC-xml/#NT-Char). - Added tests for valid char references. - Caught and fixed a parsing bug for decimal char refs >= &#65536; (0x10000, i.e. supplementary code points).
1 parent dd8d35a commit 1dafbae

File tree

2 files changed

+153
-15
lines changed

2 files changed

+153
-15
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+24-15
Original file line numberDiff line numberDiff line change
@@ -2664,13 +2664,16 @@ protected char[] parseEntityRef()
26642664
entityRefName = null;
26652665
posStart = pos;
26662666
char ch = more();
2667-
StringBuilder sb = new StringBuilder();
26682667
if ( ch == '#' )
26692668
{
26702669
// parse character reference
2670+
26712671
char charRef = 0;
26722672
ch = more();
2673-
if ( ch == 'x' )
2673+
StringBuilder sb = new StringBuilder();
2674+
boolean isHex = ( ch == 'x' );
2675+
2676+
if ( isHex )
26742677
{
26752678
// encoded in hex
26762679
while ( true )
@@ -2710,6 +2713,7 @@ else if ( ch >= 'A' && ch <= 'F' )
27102713
if ( ch >= '0' && ch <= '9' )
27112714
{
27122715
charRef = (char) ( charRef * 10 + ( ch - '0' ) );
2716+
sb.append( ch );
27132717
}
27142718
else if ( ch == ';' )
27152719
{
@@ -2724,20 +2728,19 @@ else if ( ch >= 'A' && ch <= 'F' )
27242728
}
27252729
}
27262730
posEnd = pos - 1;
2727-
if ( sb.length() > 0 )
2731+
try
27282732
{
2729-
char[] tmp = toChars( Integer.parseInt( sb.toString(), 16 ) );
2730-
charRefOneCharBuf = tmp;
2731-
if ( tokenize )
2732-
{
2733-
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
2734-
}
2735-
return charRefOneCharBuf;
2733+
charRefOneCharBuf = toChars( Integer.parseInt( sb.toString(), isHex ? 16 : 10 ) );
27362734
}
2737-
charRefOneCharBuf[0] = charRef;
2735+
catch ( IllegalArgumentException e )
2736+
{
2737+
throw new XmlPullParserException( "character reference (with " + ( isHex ? "hex" : "decimal" )
2738+
+ " value " + sb.toString() + ") is invalid", this, null );
2739+
}
2740+
27382741
if ( tokenize )
27392742
{
2740-
text = newString( charRefOneCharBuf, 0, 1 );
2743+
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
27412744
}
27422745
return charRefOneCharBuf;
27432746
}
@@ -3996,15 +3999,21 @@ private static boolean isHighSurrogate( char ch )
39963999
return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
39974000
}
39984001

3999-
private static final int MIN_CODE_POINT = 0x000000;
4000-
40014002
private static final int MAX_CODE_POINT = 0x10FFFF;
40024003

40034004
private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
40044005

4006+
/**
4007+
* Check if the provided parameter is a valid Char, according to: {@link https://www.w3.org/TR/REC-xml/#NT-Char}
4008+
*
4009+
* @param codePoint the numeric value to check
4010+
* @return true if it is a valid numeric character reference. False otherwise.
4011+
*/
40054012
private static boolean isValidCodePoint( int codePoint )
40064013
{
4007-
return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
4014+
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
4015+
return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || ( 0x20 <= codePoint && codePoint <= 0xD7FF )
4016+
|| ( 0xE000 <= codePoint && codePoint <= 0xFFFD ) || ( 0x10000 <= codePoint && codePoint <= 0x10FFFF );
40084017
}
40094018

40104019
private static boolean isSupplementaryCodePoint( int codePoint )

src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+129
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
*/
1818

1919
import static org.junit.Assert.assertEquals;
20+
import static org.junit.Assert.assertTrue;
21+
import static org.junit.Assert.fail;
2022

2123
import java.io.IOException;
2224
import java.io.StringReader;
@@ -156,6 +158,133 @@ public void testUnicodeEntities()
156158
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
157159
}
158160

161+
@Test
162+
public void testInvalidCharacterReferenceHexa()
163+
throws Exception
164+
{
165+
MXParser parser = new MXParser();
166+
String input = "<root>&#x110000;</root>";
167+
parser.setInput( new StringReader( input ) );
168+
169+
try
170+
{
171+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
172+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
173+
fail( "Should fail since &#x110000; is an illegal character reference" );
174+
}
175+
catch ( XmlPullParserException e )
176+
{
177+
assertTrue( e.getMessage().contains( "character reference (with hex value 110000) is invalid" ) );
178+
}
179+
}
180+
181+
@Test
182+
public void testValidCharacterReferenceHexa()
183+
throws Exception
184+
{
185+
MXParser parser = new MXParser();
186+
String input = "<root>&#x9;&#xA;&#xD;&#x20;&#x200;&#xD7FF;&#xE000;&#xFFA2;&#xFFFD;&#x10000;&#x10FFFD;&#x10FFFF;</root>";
187+
parser.setInput( new StringReader( input ) );
188+
189+
try
190+
{
191+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
192+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
193+
assertEquals( 0x9, parser.getText().codePointAt( 0 ) );
194+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
195+
assertEquals( 0xA, parser.getText().codePointAt( 0 ) );
196+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
197+
assertEquals( 0xD, parser.getText().codePointAt( 0 ) );
198+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
199+
assertEquals( 0x20, parser.getText().codePointAt( 0 ) );
200+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
201+
assertEquals( 0x200, parser.getText().codePointAt( 0 ) );
202+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
203+
assertEquals( 0xD7FF, parser.getText().codePointAt( 0 ) );
204+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
205+
assertEquals( 0xE000, parser.getText().codePointAt( 0 ) );
206+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
207+
assertEquals( 0xFFA2, parser.getText().codePointAt( 0 ) );
208+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
209+
assertEquals( 0xFFFD, parser.getText().codePointAt( 0 ) );
210+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
211+
assertEquals( 0x10000, parser.getText().codePointAt( 0 ) );
212+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
213+
assertEquals( 0x10FFFD, parser.getText().codePointAt( 0 ) );
214+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
215+
assertEquals( 0x10FFFF, parser.getText().codePointAt( 0 ) );
216+
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
217+
}
218+
catch ( XmlPullParserException e )
219+
{
220+
fail( "Should success since the input represents all legal character references" );
221+
}
222+
}
223+
224+
@Test
225+
public void testInvalidCharacterReferenceDecimal()
226+
throws Exception
227+
{
228+
MXParser parser = new MXParser();
229+
String input = "<root>&#1114112;</root>";
230+
parser.setInput( new StringReader( input ) );
231+
232+
try
233+
{
234+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
235+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
236+
fail( "Should fail since &#1114112; is an illegal character reference" );
237+
}
238+
catch ( XmlPullParserException e )
239+
{
240+
assertTrue( e.getMessage().contains( "character reference (with decimal value 1114112) is invalid" ) );
241+
}
242+
}
243+
244+
@Test
245+
public void testValidCharacterReferenceDecimal()
246+
throws Exception
247+
{
248+
MXParser parser = new MXParser();
249+
String input =
250+
"<root>&#9;&#10;&#13;&#32;&#512;&#55295;&#57344;&#65442;&#65533;&#65536;&#1114109;&#1114111;</root>";
251+
parser.setInput( new StringReader( input ) );
252+
253+
try
254+
{
255+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
256+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
257+
assertEquals( 9, parser.getText().codePointAt( 0 ) );
258+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
259+
assertEquals( 10, parser.getText().codePointAt( 0 ) );
260+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
261+
assertEquals( 13, parser.getText().codePointAt( 0 ) );
262+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
263+
assertEquals( 32, parser.getText().codePointAt( 0 ) );
264+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
265+
assertEquals( 512, parser.getText().codePointAt( 0 ) );
266+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
267+
assertEquals( 55295, parser.getText().codePointAt( 0 ) );
268+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
269+
assertEquals( 57344, parser.getText().codePointAt( 0 ) );
270+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
271+
assertEquals( 65442, parser.getText().codePointAt( 0 ) );
272+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
273+
assertEquals( 65533, parser.getText().codePointAt( 0 ) );
274+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
275+
assertEquals( 65536, parser.getText().codePointAt( 0 ) );
276+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
277+
assertEquals( 1114109, parser.getText().codePointAt( 0 ) );
278+
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
279+
assertEquals( 1114111, parser.getText().codePointAt( 0 ) );
280+
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
281+
}
282+
catch ( XmlPullParserException e )
283+
{
284+
fail( "Should success since the input represents all legal character references" );
285+
}
286+
}
287+
159288
@Test
160289
public void testProcessingInstruction()
161290
throws Exception

0 commit comments

Comments
 (0)