html5lib · SimonSapin · Nov 30, 2013 · Dec 2, 2013
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -129,6 +129,17 @@ def reset(self):
 
         self.framesetOK = True
 
+    @property
+    def documentEncoding(self):
+        """Name of the character encoding used to decode the input stream.
+
+        :obj:`None` while no tokenizer exists yet, and also for input that
+        is already Unicode, where no decoding takes place.
+        """
+        if hasattr(self, 'tokenizer'):
+            return self.tokenizer.stream.documentEncoding
+        return None
+
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object):
 
     _defaultChunkSize = 10240
 
+    documentEncoding = None  # No encoding involved for Unicode input.
+
     def __init__(self, source):
         """Initialises the HTMLInputStream.
 
@@ -413,6 +415,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
+    @property
+    def documentEncoding(self):
+        return self.charEncoding[0]  # the codec name reset() decodes with
+
     def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')

diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 
+def test_unicode_input_encoding():
+    byte_parser = HTMLParser()  # bytes in: expect the <meta> charset
+    assert byte_parser.documentEncoding is None
+    byte_parser.parse(b'<meta charset=latin2>', useChardet=False)
+    assert byte_parser.documentEncoding == 'iso8859-2'
+
+    text_parser = HTMLParser()  # unicode in: expect no encoding at all
+    assert text_parser.documentEncoding is None
+    text_parser.parse('<meta charset=latin2>')
+    assert text_parser.documentEncoding is None
+
+    rejecting_parser = HTMLParser()  # unicode in + encoding=: TypeError
+    assert rejecting_parser.documentEncoding is None
+    raised = False
+    try:
+        rejecting_parser.parse('<meta charset=latin2>', encoding='latin3')
+    except TypeError:
+        raised = True
+    assert raised, 'Expected TypeError'
+    assert rejecting_parser.documentEncoding is None
+
+
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
+    assert p.documentEncoding is None  # nothing has been parsed yet
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
 
 
 def runPreScanEncodingTest(data, encoding):