From 9aab9221cbc1ea301e8da5096e02653b58947e78 Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Sat, 30 Nov 2013 23:01:43 +0000
Subject: [PATCH 1/2] Add a usedEncoding method to HTMLParser, fix #121

---
 html5lib/html5parser.py         | 10 ++++++++++
 html5lib/inputstream.py         |  6 ++++++
 html5lib/tests/test_encoding.py | 25 ++++++++++++++++++++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b0f14f39..f0121a4b 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -129,6 +129,16 @@ def reset(self):
 
         self.framesetOK = True
 
+    def usedEncoding(self):
+        """Return the name of the character encoding
+        that was used to decode the input stream,
+        or :obj:`None` if that is not determined yet.
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.usedEncoding()
+
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 004bdd4a..0275c0b8 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -175,6 +175,9 @@ def __init__(self, source):
 
         self.reset()
 
+    def usedEncoding(self):
+        return None  # No encoding involved for Unicode input.
+
     def reset(self):
         self.chunk = ""
         self.chunkSize = 0
@@ -413,6 +416,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
+    def usedEncoding(self):
+        return self.charEncoding[0]
+
     def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index f314421d..6a1a6d0d 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 
+def test_unicode_input_encoding():
+    p = HTMLParser()
+    assert p.usedEncoding() is None
+    p.parse(b'<meta charset=latin2>', useChardet=False)
+    assert p.usedEncoding() == 'iso8859-2'
+
+    p = HTMLParser()
+    assert p.usedEncoding() is None
+    p.parse('<meta charset=latin2>')
+    assert p.usedEncoding() is None
+
+    p = HTMLParser()
+    assert p.usedEncoding() is None
+    try:
+        p.parse('<meta charset=latin2>', encoding='latin3')
+    except TypeError:
+        pass
+    else:
+        assert 0, 'Expected TypeError'
+    assert p.usedEncoding() is None
+
+
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
+    assert p.usedEncoding() is None
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+    assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding())
 
 
 def runPreScanEncodingTest(data, encoding):

From 722bfd36028fc5dd65babb266499e3b0b1bb770b Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Mon, 2 Dec 2013 14:44:02 +0000
Subject: [PATCH 2/2] Rename the usedEncoding method to documentEncoding, and
 make it a property.

---
 html5lib/html5parser.py         |  7 ++++---
 html5lib/inputstream.py         |  8 ++++----
 html5lib/tests/test_encoding.py | 16 ++++++++--------
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index f0121a4b..14a494a7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -129,15 +129,16 @@ def reset(self):
 
         self.framesetOK = True
 
-    def usedEncoding(self):
-        """Return the name of the character encoding
+    @property
+    def documentEncoding(self):
+        """The name of the character encoding
         that was used to decode the input stream,
         or :obj:`None` if that is not determined yet.
 
         """
         if not hasattr(self, 'tokenizer'):
             return None
-        return self.tokenizer.stream.usedEncoding()
+        return self.tokenizer.stream.documentEncoding
 
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 0275c0b8..1fb38a2f 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object):
 
     _defaultChunkSize = 10240
 
+    documentEncoding = None  # No encoding involved for Unicode input.
+
     def __init__(self, source):
         """Initialises the HTMLInputStream.
 
@@ -175,9 +177,6 @@ def __init__(self, source):
 
         self.reset()
 
-    def usedEncoding(self):
-        return None  # No encoding involved for Unicode input.
-
     def reset(self):
         self.chunk = ""
         self.chunkSize = 0
@@ -416,7 +415,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
-    def usedEncoding(self):
+    @property
+    def documentEncoding(self):
         return self.charEncoding[0]
 
     def reset(self):
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 6a1a6d0d..fb713761 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -28,33 +28,33 @@ def test_codec_name_d(self):
 
 def test_unicode_input_encoding():
     p = HTMLParser()
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
     p.parse(b'<meta charset=latin2>', useChardet=False)
-    assert p.usedEncoding() == 'iso8859-2'
+    assert p.documentEncoding == 'iso8859-2'
 
     p = HTMLParser()
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
     p.parse('<meta charset=latin2>')
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
 
     p = HTMLParser()
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
     try:
         p.parse('<meta charset=latin2>', encoding='latin3')
     except TypeError:
         pass
     else:
         assert 0, 'Expected TypeError'
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
 
 
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
-    assert p.usedEncoding() is None
+    assert p.documentEncoding is None
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding())
+    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
 
 
 def runPreScanEncodingTest(data, encoding):