From 9aab9221cbc1ea301e8da5096e02653b58947e78 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 30 Nov 2013 23:01:43 +0000 Subject: [PATCH 1/2] Add a usedEncoding method to HTML5Parser, fix #121 --- html5lib/html5parser.py | 10 ++++++++++ html5lib/inputstream.py | 6 ++++++ html5lib/tests/test_encoding.py | 25 ++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index b0f14f39..f0121a4b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -129,6 +129,16 @@ def reset(self): self.framesetOK = True + def usedEncoding(self): + """Return the name of the character encoding + that was used to decode the input stream, + or :obj:`None` if that is not determined yet. + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.usedEncoding() + def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 004bdd4a..0275c0b8 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -175,6 +175,9 @@ def __init__(self, source): self.reset() + def usedEncoding(self): + return None # No encoding involved for Unicode input. + def reset(self): self.chunk = "" self.chunkSize = 0 @@ -413,6 +416,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): # Call superclass self.reset() + def usedEncoding(self): + return self.charEncoding[0] + def reset(self): self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, 'replace') diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index f314421d..6a1a6d0d 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -26,12 +26,35 @@ def test_codec_name_d(self): self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252") +def test_unicode_input_encoding(): + p = HTMLParser() + assert p.usedEncoding() is None + p.parse(b'', useChardet=False) + assert p.usedEncoding() == 'iso8859-2' + + p = HTMLParser() + assert p.usedEncoding() is None + p.parse('') + assert p.usedEncoding() is None + + p = HTMLParser() + assert p.usedEncoding() is None + try: + p.parse('', encoding='latin3') + except TypeError: + pass + else: + assert 0, 'Expected TypeError' + assert p.usedEncoding() is None + + def runParserEncodingTest(data, encoding): p = HTMLParser() + assert p.usedEncoding() is None p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") - assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0]) + assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding()) def runPreScanEncodingTest(data, encoding): From 722bfd36028fc5dd65babb266499e3b0b1bb770b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 2 Dec 2013 14:44:02 +0000 Subject: [PATCH 2/2] Rename the usedEncoding method to documentEncoding, and make it a property. --- html5lib/html5parser.py | 7 ++++--- html5lib/inputstream.py | 8 ++++---- html5lib/tests/test_encoding.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index f0121a4b..14a494a7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -129,15 +129,16 @@ def reset(self): self.framesetOK = True - def usedEncoding(self): - """Return the name of the character encoding + @property + def documentEncoding(self): + """The name of the character encoding that was used to decode the input stream, or :obj:`None` if that is not determined yet. """ if not hasattr(self, 'tokenizer'): return None - return self.tokenizer.stream.usedEncoding() + return self.tokenizer.stream.documentEncoding def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 0275c0b8..1fb38a2f 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object): _defaultChunkSize = 10240 + documentEncoding = None # No encoding involved for Unicode input. + def __init__(self, source): """Initialises the HTMLInputStream. @@ -175,9 +177,6 @@ def __init__(self, source): self.reset() - def usedEncoding(self): - return None # No encoding involved for Unicode input. - def reset(self): self.chunk = "" self.chunkSize = 0 @@ -416,7 +415,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): # Call superclass self.reset() - def usedEncoding(self): + @property + def documentEncoding(self): return self.charEncoding[0] def reset(self): diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 6a1a6d0d..fb713761 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -28,33 +28,33 @@ def test_codec_name_d(self): def test_unicode_input_encoding(): p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse(b'', useChardet=False) - assert p.usedEncoding() == 'iso8859-2' + assert p.documentEncoding == 'iso8859-2' p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse('') - assert p.usedEncoding() is None + assert p.documentEncoding is None p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None try: p.parse('', encoding='latin3') except TypeError: pass else: assert 0, 'Expected TypeError' - assert p.usedEncoding() is None + assert p.documentEncoding is None def runParserEncodingTest(data, encoding): p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") - assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding()) + assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding) def runPreScanEncodingTest(data, encoding):