Skip to content

Commit 1b71a4e

Browse files
belingueres, gnodet
authored and committed
Fix parsing a UTF-8 file without BOM and with ISO-8859-1 encoding (#242)
* Deleted most code handling encoding (leaving that job to the XmlReader). * Fixed tests exercising encoding checks. Unsupported tests were skipped. * Simplified the test-encoding-ISO-8859-1.xml test file. Skipped even more tests that pass on Linux but fail on Windows.
1 parent 31016cd commit 1b71a4e

File tree

4 files changed

+174
-1555
lines changed

4 files changed

+174
-1555
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+1-30
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,8 @@
1515
import java.io.Reader;
1616
import java.io.UnsupportedEncodingException;
1717

18-
import org.codehaus.plexus.util.xml.XmlReader;
1918
import org.codehaus.plexus.util.xml.XmlStreamReader;
2019

21-
//import java.util.Hashtable;
22-
2320
//TODO best handling of interning issues
2421
// have isAllNewStringInterned ???
2522

@@ -124,7 +121,6 @@ private String newStringIntern( char[] cbuf, int off, int len )
124121
// private String elValue[];
125122
private int elNamespaceCount[];
126123

127-
private String fileEncoding = null;
128124

129125
/**
130126
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -663,20 +659,6 @@ public void setInput( Reader in )
663659
{
664660
reset();
665661
reader = in;
666-
667-
if ( reader instanceof XmlReader ) {
668-
// encoding already detected
669-
XmlReader xsr = (XmlReader) reader;
670-
fileEncoding = xsr.getEncoding();
671-
}
672-
else if ( reader instanceof InputStreamReader )
673-
{
674-
InputStreamReader isr = (InputStreamReader) reader;
675-
if ( isr.getEncoding() != null )
676-
{
677-
fileEncoding = isr.getEncoding().toUpperCase();
678-
}
679-
}
680662
}
681663

682664
@Override
@@ -3432,18 +3414,7 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
34323414
final int encodingEnd = pos - 1;
34333415

34343416
// TODO reconcile with setInput encodingName
3435-
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
3436-
3437-
if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
3438-
{
3439-
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
3440-
this, null );
3441-
}
3442-
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
3443-
{
3444-
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
3445-
this, null );
3446-
}
3417+
// inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
34473418

34483419
lastParsedAttr = "encoding";
34493420

src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+91-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.io.InputStream;
2828
import java.io.Reader;
2929
import java.io.StringReader;
30+
import java.nio.charset.StandardCharsets;
3031
import java.nio.file.Files;
3132
import java.nio.file.Paths;
3233

@@ -968,7 +969,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
968969
* @since 3.4.1
969970
*/
970971
@Test
971-
public void testEncodingISO_8859_1setInputReader()
972+
public void testEncodingISO_8859_1_newXmlReader()
972973
throws IOException
973974
{
974975
try ( Reader reader =
@@ -994,7 +995,7 @@ public void testEncodingISO_8859_1setInputReader()
994995
* @since 3.4.1
995996
*/
996997
@Test
997-
public void testEncodingISO_8859_1_setInputStream()
998+
public void testEncodingISO_8859_1_InputStream()
998999
throws IOException
9991000
{
10001001
try ( InputStream input =
@@ -1012,12 +1013,6 @@ public void testEncodingISO_8859_1_setInputStream()
10121013
}
10131014
}
10141015

1015-
private static void assertPosition( int row, int col, MXParser parser )
1016-
{
1017-
assertEquals( "Current line", row, parser.getLineNumber() );
1018-
assertEquals( "Current column", col, parser.getColumnNumber() );
1019-
}
1020-
10211016
/**
10221017
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
10231018
*
@@ -1028,7 +1023,7 @@ private static void assertPosition( int row, int col, MXParser parser )
10281023
* @since 3.4.2
10291024
*/
10301025
@Test
1031-
public void testEncodingISO_8859_1setStringReader()
1026+
public void testEncodingISO_8859_1_StringReader()
10321027
throws IOException
10331028
{
10341029
String xmlFileContents;
@@ -1050,6 +1045,93 @@ public void testEncodingISO_8859_1setStringReader()
10501045
}
10511046
}
10521047

1048+
/**
1049+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1050+
*
1051+
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding.
1052+
*
1053+
* @throws IOException if IO error.
1054+
*
1055+
* @since 3.5.2
1056+
*/
1057+
@Test
1058+
public void testEncodingISO_8859_1_newReader()
1059+
throws IOException
1060+
{
1061+
try ( Reader reader =
1062+
ReaderFactory.newReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ),
1063+
StandardCharsets.UTF_8.name() ) )
1064+
{
1065+
MXParser parser = new MXParser();
1066+
parser.setInput( reader );
1067+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1068+
;
1069+
assertTrue( true );
1070+
}
1071+
catch ( XmlPullParserException e )
1072+
{
1073+
fail( "should not raise exception: " + e );
1074+
}
1075+
}
1076+
1077+
/**
1078+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1079+
*
1080+
* Another case of bug #163: InputStream supplied with the right file encoding.
1081+
*
1082+
* @throws IOException if IO error.
1083+
*
1084+
* @since 3.5.2
1085+
*/
1086+
@Test
1087+
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException {
1088+
try ( InputStream input =
1089+
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1090+
{
1091+
MXParser parser = new MXParser();
1092+
parser.setInput( input, StandardCharsets.UTF_8.name() );
1093+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1094+
;
1095+
assertTrue( true );
1096+
}
1097+
catch ( XmlPullParserException e )
1098+
{
1099+
fail( "should not raise exception: " + e );
1100+
}
1101+
}
1102+
1103+
/**
1104+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1105+
*
1106+
* @throws IOException if IO error.
1107+
*
1108+
* @since 3.4.1
1109+
*/
1110+
@Test
1111+
public void testEncodingUTF8_newXmlReader()
1112+
throws IOException
1113+
{
1114+
try ( Reader reader =
1115+
ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1116+
{
1117+
MXParser parser = new MXParser();
1118+
parser.setInput( reader );
1119+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1120+
;
1121+
assertTrue( true );
1122+
}
1123+
catch ( XmlPullParserException e )
1124+
{
1125+
fail( "should not raise exception: " + e );
1126+
}
1127+
}
1128+
1129+
private static void assertPosition( int row, int col, MXParser parser )
1130+
{
1131+
assertEquals( "Current line", row, parser.getLineNumber() );
1132+
assertEquals( "Current column", col, parser.getColumnNumber() );
1133+
}
1134+
10531135
/**
10541136
* <p>
10551137
* Test custom Entity not found.

src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java

+81-14
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44
import static org.junit.Assert.fail;
55

66
import java.io.File;
7-
import java.io.FileInputStream;
87
import java.io.FileReader;
98
import java.io.IOException;
10-
import java.io.InputStreamReader;
119
import java.io.Reader;
1210
import java.nio.charset.StandardCharsets;
1311

12+
import org.codehaus.plexus.util.ReaderFactory;
1413
import org.junit.Before;
1514
import org.junit.Test;
1615

@@ -207,13 +206,15 @@ public void testhst_bh_006()
207206
* Version:
208207
*
209208
* @throws java.io.IOException if there is an I/O error
209+
*
210+
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether a UTF-8 file
211+
* has a BOM or not
210212
*/
211-
@Test
213+
// @Test
212214
public void testhst_lhs_007()
213215
throws IOException
214216
{
215-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
216-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
217+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "007.xml" ) ) )
217218
{
218219
parser.setInput( reader );
219220
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
@@ -234,13 +235,45 @@ public void testhst_lhs_007()
234235
* Version:
235236
*
236237
* @throws java.io.IOException if there is an I/O error
238+
*
239+
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether a UTF-16 file
240+
* has a BOM or not
237241
*/
238-
@Test
239-
public void testhst_lhs_008()
242+
// @Test
243+
public void testhst_lhs_008_newReader()
244+
throws IOException
245+
{
246+
try ( Reader reader =
247+
ReaderFactory.newReader( new File( testResourcesDir, "008.xml" ), StandardCharsets.UTF_16.name() ) )
248+
{
249+
parser.setInput( reader );
250+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
251+
;
252+
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
253+
}
254+
catch ( XmlPullParserException e )
255+
{
256+
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
257+
}
258+
}
259+
260+
/**
261+
* Test ID: <pre>hst-lhs-008</pre>
262+
* Test URI: <pre>008.xml</pre>
263+
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
264+
* Sections: <pre>4.3.3</pre>
265+
* Version:
266+
*
267+
* @throws java.io.IOException if there is an I/O error
268+
*
269+
* NOTE: This test is SKIPPED as MXParser is unable to detect a UTF-16 BOM when chars are read as
270+
* UTF-8, and XmlReader in lenient mode does not throw an exception.
271+
*/
272+
// @Test
273+
public void testhst_lhs_008_XmlReader()
240274
throws IOException
241275
{
242-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
243-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
276+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "008.xml" ) ) )
244277
{
245278
parser.setInput( reader );
246279
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
@@ -261,14 +294,17 @@ public void testhst_lhs_008()
261294
* Version:
262295
*
263296
* @throws java.io.IOException if there is an I/O error
297+
*
298+
* NOTE: This test is SKIPPED as MXParser is unable to detect a UTF-16 BOM when chars are read as
299+
* UTF-8.
264300
*/
265-
@Test
266-
public void testhst_lhs_009()
301+
// @Test
302+
public void testhst_lhs_009_newReader()
267303
throws IOException
268304
{
269-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
270-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
271-
{
305+
try ( Reader reader =
306+
ReaderFactory.newReader( new File( testResourcesDir, "009.xml" ), StandardCharsets.UTF_16.name() ) )
307+
{
272308
parser.setInput( reader );
273309
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
274310
;
@@ -280,4 +316,35 @@ public void testhst_lhs_009()
280316
}
281317
}
282318

319+
/**
320+
* Test ID: <pre>hst-lhs-009</pre>
321+
* Test URI: <pre>009.xml</pre>
322+
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
323+
* Sections: <pre>4.3.3</pre>
324+
* Version:
325+
*
326+
* @throws java.io.IOException if there is an I/O error
327+
*/
328+
@Test
329+
public void testhst_lhs_009_XmlReader()
330+
throws IOException
331+
{
332+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "009.xml" ) ) )
333+
{
334+
parser.setInput( reader );
335+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
336+
;
337+
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
338+
}
339+
catch ( IOException e )
340+
{
341+
// even when XmlReader is in lenient mode, it throws an IOException
342+
assertTrue( e.getMessage().contains( "Invalid encoding, BOM [UTF-16BE] XML guess [UTF-8] XML prolog [UTF-8] encoding mismatch" ) );
343+
}
344+
catch ( XmlPullParserException e )
345+
{
346+
fail( "Encoding problem should be detected by the XmlReader" );
347+
}
348+
}
349+
283350
}

0 commit comments

Comments
 (0)