Skip to content

Commit ff26c75

Browse files
committed
Fix BOM / encoding problems
* enable testhst_lhs_007, testhst_lhs_008 and testhst_lhs_009 for InputStream * disable those tests on readers, as readers bypass any encoding * do not try to discover the encoding used when the input is given as a Reader * add an ISO-8859-1 encoded comment in the test xml (the testEncodingISO_8859_1_newReader and testEncodingISO_8859_1_InputStream_encoded tests decode it wrongly as they use UTF-8)
1 parent 831f645 commit ff26c75

File tree

4 files changed

+40
-92
lines changed

4 files changed

+40
-92
lines changed

src/main/java/org/codehaus/plexus/util/xml/XmlReader.java

+2-5
Original file line numberDiff line numberDiff line change
@@ -504,11 +504,8 @@ else if ( bomEnc.equals( UTF_8 ) )
504504
}
505505
else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
506506
{
507-
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
508-
{
509-
throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
510-
}
511-
if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
507+
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc )
508+
|| xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
512509
{
513510
throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
514511
bomEnc, xmlGuessEnc, xmlEnc, is );

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+20-4
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111

1212
import java.io.EOFException;
1313
import java.io.IOException;
14+
import java.io.InputStreamReader;
1415
import java.io.Reader;
1516
import java.io.UnsupportedEncodingException;
1617

17-
import org.codehaus.plexus.util.ReaderFactory;
18+
import org.codehaus.plexus.util.xml.XmlReader;
19+
import org.codehaus.plexus.util.xml.XmlStreamReader;
20+
import org.codehaus.plexus.util.xml.XmlStreamReaderException;
1821

1922
//import java.util.Hashtable;
2023

@@ -122,6 +125,7 @@ private String newStringIntern( char[] cbuf, int off, int len )
122125
// private String elValue[];
123126
private int elNamespaceCount[];
124127

128+
private String fileEncoding = null;
125129

126130
/**
127131
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -675,18 +679,30 @@ public void setInput( java.io.InputStream inputStream, String inputEncoding )
675679
{
676680
if ( inputEncoding != null )
677681
{
678-
reader = ReaderFactory.newReader( inputStream, inputEncoding );
682+
reader = new InputStreamReader( inputStream, inputEncoding );
679683
}
680684
else
681685
{
682-
reader = ReaderFactory.newXmlReader( inputStream );
686+
reader = new XmlStreamReader( inputStream, false );
683687
}
684688
}
685689
catch ( UnsupportedEncodingException une )
686690
{
687691
throw new XmlPullParserException( "could not create reader for encoding " + inputEncoding + " : " + une,
688692
this, une );
689693
}
694+
catch ( XmlStreamReaderException e )
695+
{
696+
if ( "UTF-8".equals( e.getBomEncoding() ) )
697+
{
698+
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + e.getXmlEncoding() + " is incompatible", this, e );
699+
}
700+
if ( e.getBomEncoding() != null && e.getBomEncoding().startsWith( "UTF-16" ) )
701+
{
702+
throw new XmlPullParserException( "UTF-16 BOM in a " + e.getXmlEncoding() + " encoded file is incompatible", this, e );
703+
}
704+
throw new XmlPullParserException( "could not create reader : " + e, this, e );
705+
}
690706
catch ( IOException e )
691707
{
692708
throw new XmlPullParserException( "could not create reader : " + e, this, e );
@@ -3415,7 +3431,7 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
34153431
final int encodingEnd = pos - 1;
34163432

34173433
// TODO reconcile with setInput encodingName
3418-
// inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
3434+
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
34193435

34203436
lastParsedAttr = "encoding";
34213437

src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java

+17-83
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
import static org.junit.Assert.fail;
55

66
import java.io.File;
7+
import java.io.FileInputStream;
78
import java.io.FileReader;
89
import java.io.IOException;
10+
import java.io.InputStream;
911
import java.io.Reader;
1012
import java.nio.charset.StandardCharsets;
1113

@@ -206,24 +208,21 @@ public void testhst_bh_006()
206208
* Version:
207209
*
208210
* @throws java.io.IOException if there is an I/O error
209-
*
210-
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether UTF-8 file
211-
* has a BOM or not
212211
*/
213-
// @Test
212+
@Test
214213
public void testhst_lhs_007()
215214
throws IOException
216215
{
217-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "007.xml" ) ) )
216+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ) )
218217
{
219-
parser.setInput( reader );
218+
parser.setInput( is, null );
220219
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
221220
;
222-
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
221+
fail( "UTF-8 BOM plus xml decl of ISO-8859-1 incompatible" );
223222
}
224223
catch ( XmlPullParserException e )
225224
{
226-
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
225+
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible" ) );
227226
}
228227
}
229228

@@ -235,57 +234,24 @@ public void testhst_lhs_007()
235234
* Version:
236235
*
237236
* @throws java.io.IOException if there is an I/O error
238-
*
239-
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether UTF-16 file
240-
* has a BOM or not
241237
*/
242-
// @Test
243-
public void testhst_lhs_008_newReader()
238+
@Test
239+
public void testhst_lhs_008()
244240
throws IOException
245241
{
246-
try ( Reader reader =
247-
ReaderFactory.newReader( new File( testResourcesDir, "008.xml" ), StandardCharsets.UTF_16.name() ) )
242+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ) )
248243
{
249-
parser.setInput( reader );
244+
parser.setInput( is, null );
250245
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
251246
;
252-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
247+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-16 coding) incompatible" );
253248
}
254249
catch ( XmlPullParserException e )
255250
{
256251
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
257252
}
258253
}
259254

260-
/**
261-
* Test ID: <pre>hst-lhs-008</pre>
262-
* Test URI: <pre>008.xml</pre>
263-
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
264-
* Sections: <pre>4.3.3</pre>
265-
* Version:
266-
*
267-
* @throws java.io.IOException if there is an I/O error
268-
*
269-
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
270-
* UTF-8, and XmlReader in lenient mode does not throw exception.
271-
*/
272-
// @Test
273-
public void testhst_lhs_008_XmlReader()
274-
throws IOException
275-
{
276-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "008.xml" ) ) )
277-
{
278-
parser.setInput( reader );
279-
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
280-
;
281-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
282-
}
283-
catch ( XmlPullParserException e )
284-
{
285-
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
286-
}
287-
}
288-
289255
/**
290256
* Test ID: <pre>hst-lhs-009</pre>
291257
* Test URI: <pre>009.xml</pre>
@@ -298,52 +264,20 @@ public void testhst_lhs_008_XmlReader()
298264
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
299265
* UTF-8.
300266
*/
301-
// @Test
302-
public void testhst_lhs_009_newReader()
303-
throws IOException
304-
{
305-
try ( Reader reader =
306-
ReaderFactory.newReader( new File( testResourcesDir, "009.xml" ), StandardCharsets.UTF_16.name() ) )
307-
{
308-
parser.setInput( reader );
309-
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
310-
;
311-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
312-
}
313-
catch ( XmlPullParserException e )
314-
{
315-
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
316-
}
317-
}
318-
319-
/**
320-
* Test ID: <pre>hst-lhs-009</pre>
321-
* Test URI: <pre>009.xml</pre>
322-
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
323-
* Sections: <pre>4.3.3</pre>
324-
* Version:
325-
*
326-
* @throws java.io.IOException if there is an I/O error
327-
*/
328267
@Test
329-
public void testhst_lhs_009_XmlReader()
268+
public void testhst_lhs_009()
330269
throws IOException
331270
{
332-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "009.xml" ) ) )
271+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ) )
333272
{
334-
parser.setInput( reader );
273+
parser.setInput( is, null );
335274
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
336275
;
337-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
338-
}
339-
catch ( IOException e )
340-
{
341-
// even when XmlReader is in lenient mode, it throws an IOException
342-
assertTrue( e.getMessage().contains( "Invalid encoding, BOM [UTF-16BE] XML guess [UTF-8] XML prolog [UTF-8] encoding mismatch" ) );
276+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-8 coding) incompatible" );
343277
}
344278
catch ( XmlPullParserException e )
345279
{
346-
fail( "Encoding problem should be detected by the XmlReader" );
280+
assertTrue( e.getMessage(), e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
347281
}
348282
}
349283

Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
<?xml version="1.0" encoding="ISO-8859-1"?>
2+
<!-- Æ -->
23
<mytag/>

0 commit comments

Comments
 (0)