diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b0f14f39..14a494a7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -129,6 +129,17 @@ def reset(self):
 
         self.framesetOK = True
 
+    @property
+    def documentEncoding(self):
+        """The name of the character encoding
+        that was used to decode the input stream,
+        or :obj:`None` if that is not determined yet.
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.documentEncoding
+
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 004bdd4a..1fb38a2f 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object):
 
     _defaultChunkSize = 10240
 
+    documentEncoding = None  # No encoding involved for Unicode input.
+
     def __init__(self, source):
         """Initialises the HTMLInputStream.
 
@@ -413,6 +415,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
+    @property
+    def documentEncoding(self):
+        return self.charEncoding[0]
+
     def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index f314421d..fb713761 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 
+def test_unicode_input_encoding():
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse(b'<meta charset="iso8859-2">', useChardet=False)
+    assert p.documentEncoding == 'iso8859-2'
+
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse('')
+    assert p.documentEncoding is None
+
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    try:
+        p.parse('', encoding='latin3')
+    except TypeError:
+        pass
+    else:
+        assert 0, 'Expected TypeError'
+    assert p.documentEncoding is None
+
+
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
+    assert p.documentEncoding is None
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
 
 
 def runPreScanEncodingTest(data, encoding):