Skip to content

Fix parsing an UTF-8 file without BOM and ISO-8859-1 encoding #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/main/java/org/codehaus/plexus/util/xml/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -523,11 +523,8 @@ else if ( bomEnc.equals( UTF_8 ) )
}
else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
{
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
{
throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
}
if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc )
|| xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
{
throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
bomEnc, xmlGuessEnc, xmlEnc, is );
Expand Down
42 changes: 14 additions & 28 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import org.codehaus.plexus.util.xml.XmlReader;
import org.codehaus.plexus.util.xml.XmlStreamReader;
import org.codehaus.plexus.util.xml.XmlStreamReaderException;

//import java.util.Hashtable;

//TODO best handling of interning issues
// have isAllNewStringInterned ???
Expand Down Expand Up @@ -663,20 +662,6 @@ public void setInput( Reader in )
{
reset();
reader = in;

if ( reader instanceof XmlReader ) {
// encoding already detected
XmlReader xsr = (XmlReader) reader;
fileEncoding = xsr.getEncoding();
}
else if ( reader instanceof InputStreamReader )
{
InputStreamReader isr = (InputStreamReader) reader;
if ( isr.getEncoding() != null )
{
fileEncoding = isr.getEncoding().toUpperCase();
}
}
}

@Override
Expand All @@ -696,14 +681,26 @@ public void setInput( java.io.InputStream inputStream, String inputEncoding )
}
else
{
reader = new XmlStreamReader( inputStream );
reader = new XmlStreamReader( inputStream, false );
}
}
catch ( UnsupportedEncodingException une )
{
throw new XmlPullParserException( "could not create reader for encoding " + inputEncoding + " : " + une,
this, une );
}
catch ( XmlStreamReaderException e )
{
if ( "UTF-8".equals( e.getBomEncoding() ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + e.getXmlEncoding() + " is incompatible", this, e );
}
if ( e.getBomEncoding() != null && e.getBomEncoding().startsWith( "UTF-16" ) )
{
throw new XmlPullParserException( "UTF-16 BOM in a " + e.getXmlEncoding() + " encoded file is incompatible", this, e );
}
throw new XmlPullParserException( "could not create reader : " + e, this, e );
}
catch ( IOException e )
{
throw new XmlPullParserException( "could not create reader : " + e, this, e );
Expand Down Expand Up @@ -3434,17 +3431,6 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
// TODO reconcile with setInput encodingName
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );

if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
{
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}

lastParsedAttr = "encoding";

ch = more();
Expand Down
104 changes: 95 additions & 9 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

Expand Down Expand Up @@ -968,7 +971,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1setInputReader()
public void testEncodingISO_8859_1_newXmlReader()
throws IOException
{
try ( Reader reader =
Expand All @@ -994,7 +997,7 @@ public void testEncodingISO_8859_1setInputReader()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1_setInputStream()
public void testEncodingISO_8859_1_InputStream()
throws IOException
{
try ( InputStream input =
Expand All @@ -1012,12 +1015,6 @@ public void testEncodingISO_8859_1_setInputStream()
}
}

private static void assertPosition( int row, int col, MXParser parser )
{
assertEquals( "Current line", row, parser.getLineNumber() );
assertEquals( "Current column", col, parser.getColumnNumber() );
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
Expand All @@ -1028,7 +1025,7 @@ private static void assertPosition( int row, int col, MXParser parser )
* @since 3.4.2
*/
@Test
public void testEncodingISO_8859_1setStringReader()
public void testEncodingISO_8859_1_StringReader()
throws IOException
{
String xmlFileContents;
Expand All @@ -1050,6 +1047,95 @@ public void testEncodingISO_8859_1setStringReader()
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding.
*
* @throws IOException if IO error.
*
* @since 3.5.2
*/
@Test
public void testEncodingISO_8859_1_newReader()
throws IOException
{
// NOTE: if using Files.newBufferedReader(path, StandardCharsets.UTF-8), the reader will throw an exception
// because the decoder created by new InputStreamReader() is lenient while the one created by
// Files.newBufferedReader() is not.
try ( Reader reader = new InputStreamReader( Files.newInputStream(
Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ),
StandardCharsets.UTF_8 ) )
{
MXParser parser = new MXParser();
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* Another case of bug #163: InputStream supplied with the right file encoding.
*
* @throws IOException if IO error.
*
* @since 3.5.2
*/
@Test
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException {
try ( InputStream input =
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
{
MXParser parser = new MXParser();
parser.setInput( input, StandardCharsets.UTF_8.name() );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* @throws IOException if IO error.
*
* @since 3.4.1
*/
@Test
public void testEncodingUTF8_newXmlReader()
throws IOException
{
try ( Reader reader = new XmlStreamReader( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
{
MXParser parser = new MXParser();
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

private static void assertPosition( int row, int col, MXParser parser )
{
assertEquals( "Current line", row, parser.getLineNumber() );
assertEquals( "Current column", col, parser.getColumnNumber() );
}

/**
* <p>
* Test custom Entity not found.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.junit.Before;
import org.junit.Test;

Expand Down Expand Up @@ -212,17 +210,16 @@ public void testhst_bh_006()
public void testhst_lhs_007()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ) )
{
parser.setInput( reader );
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
fail( "UTF-8 BOM plus xml decl of ISO-8859-1 incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible" ) );
}
}

Expand All @@ -239,17 +236,16 @@ public void testhst_lhs_007()
public void testhst_lhs_008()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ) )
{
parser.setInput( reader );
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-16 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

Expand All @@ -261,22 +257,24 @@ public void testhst_lhs_008()
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
* UTF-8.
*/
@Test
public void testhst_lhs_009()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
{
parser.setInput( reader );
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ) )
{
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-8 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
assertTrue( e.getMessage(), e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

Expand Down
Loading