Skip to content

Commit 4cfda38

Browse files
committed
Fix BOM / encoding problems
* enable testhst_lhs_007, testhst_lhs_008 and testhst_lhs_009 for InputStream * disable those tests on readers, as readers bypass any encoding * do not try to discover the encoding used when the input is given as a Reader * add an ISO-8859-1 encoded comment in the test xml (the testEncodingISO_8859_1_newReader and testEncodingISO_8859_1_InputStream_encoded tests decode it incorrectly as they use UTF-8)
1 parent 1b71a4e commit 4cfda38

File tree

4 files changed

+37
-90
lines changed

4 files changed

+37
-90
lines changed

src/main/java/org/codehaus/plexus/util/xml/XmlReader.java

+2-5
Original file line numberDiff line numberDiff line change
@@ -523,11 +523,8 @@ else if ( bomEnc.equals( UTF_8 ) )
523523
}
524524
else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
525525
{
526-
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
527-
{
528-
throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
529-
}
530-
if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
526+
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc )
527+
|| xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
531528
{
532529
throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
533530
bomEnc, xmlGuessEnc, xmlEnc, is );

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+17-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import java.io.UnsupportedEncodingException;
1717

1818
import org.codehaus.plexus.util.xml.XmlStreamReader;
19+
import org.codehaus.plexus.util.xml.XmlStreamReaderException;
20+
1921

2022
//TODO best handling of interning issues
2123
// have isAllNewStringInterned ???
@@ -121,6 +123,7 @@ private String newStringIntern( char[] cbuf, int off, int len )
121123
// private String elValue[];
122124
private int elNamespaceCount[];
123125

126+
private String fileEncoding = null;
124127

125128
/**
126129
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -678,14 +681,26 @@ public void setInput( java.io.InputStream inputStream, String inputEncoding )
678681
}
679682
else
680683
{
681-
reader = new XmlStreamReader( inputStream );
684+
reader = new XmlStreamReader( inputStream, false );
682685
}
683686
}
684687
catch ( UnsupportedEncodingException une )
685688
{
686689
throw new XmlPullParserException( "could not create reader for encoding " + inputEncoding + " : " + une,
687690
this, une );
688691
}
692+
catch ( XmlStreamReaderException e )
693+
{
694+
if ( "UTF-8".equals( e.getBomEncoding() ) )
695+
{
696+
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + e.getXmlEncoding() + " is incompatible", this, e );
697+
}
698+
if ( e.getBomEncoding() != null && e.getBomEncoding().startsWith( "UTF-16" ) )
699+
{
700+
throw new XmlPullParserException( "UTF-16 BOM in a " + e.getXmlEncoding() + " encoded file is incompatible", this, e );
701+
}
702+
throw new XmlPullParserException( "could not create reader : " + e, this, e );
703+
}
689704
catch ( IOException e )
690705
{
691706
throw new XmlPullParserException( "could not create reader : " + e, this, e );
@@ -3414,7 +3429,7 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
34143429
final int encodingEnd = pos - 1;
34153430

34163431
// TODO reconcile with setInput encodingName
3417-
// inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
3432+
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
34183433

34193434
lastParsedAttr = "encoding";
34203435

src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java

+17-83
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
import static org.junit.Assert.fail;
55

66
import java.io.File;
7+
import java.io.FileInputStream;
78
import java.io.FileReader;
89
import java.io.IOException;
10+
import java.io.InputStream;
911
import java.io.Reader;
1012
import java.nio.charset.StandardCharsets;
1113

@@ -206,24 +208,21 @@ public void testhst_bh_006()
206208
* Version:
207209
*
208210
* @throws java.io.IOException if there is an I/O error
209-
*
210-
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether UTF-8 file
211-
* has a BOM or not
212211
*/
213-
// @Test
212+
@Test
214213
public void testhst_lhs_007()
215214
throws IOException
216215
{
217-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "007.xml" ) ) )
216+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ) )
218217
{
219-
parser.setInput( reader );
218+
parser.setInput( is, null );
220219
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
221220
;
222-
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
221+
fail( "UTF-8 BOM plus xml decl of ISO-8859-1 incompatible" );
223222
}
224223
catch ( XmlPullParserException e )
225224
{
226-
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
225+
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible" ) );
227226
}
228227
}
229228

@@ -235,57 +234,24 @@ public void testhst_lhs_007()
235234
* Version:
236235
*
237236
* @throws java.io.IOException if there is an I/O error
238-
*
239-
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether UTF-16 file
240-
* has a BOM or not
241237
*/
242-
// @Test
243-
public void testhst_lhs_008_newReader()
238+
@Test
239+
public void testhst_lhs_008()
244240
throws IOException
245241
{
246-
try ( Reader reader =
247-
ReaderFactory.newReader( new File( testResourcesDir, "008.xml" ), StandardCharsets.UTF_16.name() ) )
242+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ) )
248243
{
249-
parser.setInput( reader );
244+
parser.setInput( is, null );
250245
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
251246
;
252-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
247+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-16 coding) incompatible" );
253248
}
254249
catch ( XmlPullParserException e )
255250
{
256251
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
257252
}
258253
}
259254

260-
/**
261-
* Test ID: <pre>hst-lhs-008</pre>
262-
* Test URI: <pre>008.xml</pre>
263-
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
264-
* Sections: <pre>4.3.3</pre>
265-
* Version:
266-
*
267-
* @throws java.io.IOException if there is an I/O error
268-
*
269-
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
270-
* UTF-8, and XmlReader in lenient mode does not throw exception.
271-
*/
272-
// @Test
273-
public void testhst_lhs_008_XmlReader()
274-
throws IOException
275-
{
276-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "008.xml" ) ) )
277-
{
278-
parser.setInput( reader );
279-
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
280-
;
281-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
282-
}
283-
catch ( XmlPullParserException e )
284-
{
285-
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
286-
}
287-
}
288-
289255
/**
290256
* Test ID: <pre>hst-lhs-009</pre>
291257
* Test URI: <pre>009.xml</pre>
@@ -298,52 +264,20 @@ public void testhst_lhs_008_XmlReader()
298264
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
299265
* UTF-8.
300266
*/
301-
// @Test
302-
public void testhst_lhs_009_newReader()
303-
throws IOException
304-
{
305-
try ( Reader reader =
306-
ReaderFactory.newReader( new File( testResourcesDir, "009.xml" ), StandardCharsets.UTF_16.name() ) )
307-
{
308-
parser.setInput( reader );
309-
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
310-
;
311-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
312-
}
313-
catch ( XmlPullParserException e )
314-
{
315-
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
316-
}
317-
}
318-
319-
/**
320-
* Test ID: <pre>hst-lhs-009</pre>
321-
* Test URI: <pre>009.xml</pre>
322-
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
323-
* Sections: <pre>4.3.3</pre>
324-
* Version:
325-
*
326-
* @throws java.io.IOException if there is an I/O error
327-
*/
328267
@Test
329-
public void testhst_lhs_009_XmlReader()
268+
public void testhst_lhs_009()
330269
throws IOException
331270
{
332-
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "009.xml" ) ) )
271+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ) )
333272
{
334-
parser.setInput( reader );
273+
parser.setInput( is, null );
335274
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
336275
;
337-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
338-
}
339-
catch ( IOException e )
340-
{
341-
// even when XmlReader is in lenient mode, it throws an IOException
342-
assertTrue( e.getMessage().contains( "Invalid encoding, BOM [UTF-16BE] XML guess [UTF-8] XML prolog [UTF-8] encoding mismatch" ) );
276+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-8 coding) incompatible" );
343277
}
344278
catch ( XmlPullParserException e )
345279
{
346-
fail( "Encoding problem should be detected by the XmlReader" );
280+
assertTrue( e.getMessage(), e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
347281
}
348282
}
349283

Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
<?xml version="1.0" encoding="ISO-8859-1"?>
2+
<!-- Æ -->
23
<mytag/>

0 commit comments

Comments
 (0)