diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b0f14f39..14a494a7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -129,6 +129,17 @@ def reset(self):
self.framesetOK = True
+    @property
+    def documentEncoding(self):
+        """Name of the character encoding used to decode the input stream.
+
+        Evaluates to :obj:`None` for as long as that is not determined yet.
+        """
+        if hasattr(self, 'tokenizer'):
+            return self.tokenizer.stream.documentEncoding
+        # No tokenizer attribute exists on this parser so far.
+        return None
+
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 004bdd4a..1fb38a2f 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object):
_defaultChunkSize = 10240
+ documentEncoding = None # No encoding involved for Unicode input.
+
def __init__(self, source):
"""Initialises the HTMLInputStream.
@@ -413,6 +415,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
# Call superclass
self.reset()
+ @property
+ def documentEncoding(self):
+ return self.charEncoding[0]  # the codec name reset() passes to codecs.getreader
+
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index f314421d..fb713761 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
+def test_unicode_input_encoding():
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse(b'<meta charset=latin2>', useChardet=False)
+    assert p.documentEncoding == 'iso8859-2'  # latin2, as declared by the <meta>
+    # Unicode input involves no decoding, so the <meta> declaration changes nothing.
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse('<meta charset=latin2>')
+    assert p.documentEncoding is None
+    # An explicit encoding combined with unicode input is expected to raise TypeError.
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    try:
+        p.parse('<meta charset=latin2>', encoding='latin3')
+    except TypeError:
+        pass
+    else:
+        assert 0, 'Expected TypeError'
+    assert p.documentEncoding is None
+
+
def runParserEncodingTest(data, encoding):
p = HTMLParser()
+ assert p.documentEncoding is None  # freshly built parser: parse() has not been called yet
p.parse(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
- assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+ assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def runPreScanEncodingTest(data, encoding):