From 6f7bb4a05254bb22de3b7c6dcc2ce69908fa8e4c Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 18:59:24 +0100 Subject: [PATCH 01/12] Placate pyflakes. This is mostly just removing dead variables; however, there are a few substantial changes in here: - Move to using try/except ImportError/else in tests where we are checking whether some module exists, as the previous try/except structure was hiding genuine bugs that manifested themselves as ImportError (the etree treewalker was throwing ImportError when being imported). - Fixes the ImportError the etree treewalker was throwing (this was also reported as a bug by pyflakes, thereby showing its value). However, given it has been untested for a while, it is unsurprisingly broken, failing thousands of tests. - The tokenizer defined scriptDataDoubleEscapedDashState twice, therefore everything that should have been run in this state was in fact run in the scriptDataDoubleEscapedDashDashState. This also adds flake8 to Travis, albeit running it without reporting any PEP 8 errors. 
--- .travis.yml | 10 +++++++++ html5lib/__init__.py | 2 ++ html5lib/html5parser.py | 16 ++++++-------- html5lib/inputstream.py | 5 +---- html5lib/serializer/htmlserializer.py | 3 --- html5lib/tests/__init__.py | 11 --------- html5lib/tests/support.py | 8 +++---- html5lib/tests/test_encoding.py | 12 +++++----- html5lib/tests/test_parser.py | 4 +--- html5lib/tests/test_parser2.py | 2 +- html5lib/tests/test_sanitizer.py | 4 ---- html5lib/tests/test_serializer.py | 4 +--- html5lib/tests/test_stream.py | 2 +- html5lib/tests/test_tokenizer.py | 4 ---- html5lib/tests/test_treewalkers.py | 32 ++++++++++----------------- html5lib/tests/tokenizertotree.py | 1 - html5lib/tokenizer.py | 6 ++--- html5lib/treebuilders/__init__.py | 2 -- html5lib/treebuilders/dom.py | 3 +-- html5lib/treebuilders/etree.py | 20 ++++------------- html5lib/treewalkers/_base.py | 4 +--- html5lib/treewalkers/dom.py | 1 - html5lib/treewalkers/etree.py | 4 +--- html5lib/treewalkers/genshistream.py | 1 - html5lib/treewalkers/lxmletree.py | 2 -- html5lib/treewalkers/simpletree.py | 2 +- html5lib/trie/datrie.py | 2 -- html5lib/utils.py | 1 - 28 files changed, 56 insertions(+), 112 deletions(-) diff --git a/.travis.yml b/.travis.yml index a48d27f5..8402ab18 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,14 @@ env: - USE_OPTIONAL=true - USE_OPTIONAL=false +matrix: + exclude: + - python: 3.3 + env: USE_OPTIONAL=false + include: + - python: 3.3 + env: USE_OPTIONAL=false FLAKE=true + before_install: - git submodule update --init --recursive @@ -19,9 +27,11 @@ install: - if [[ $TRAVIS_PYTHON_VERSION != 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION == 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi + - if [[ $FLAKE == "true" 
]]; then pip install --use-mirrors flake8; fi script: - nosetests + - if [[ $FLAKE == "true" ]]; then flake8 --exclude=E,W html5lib; fi after_script: - python debug-info.py diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 528da9fa..10e2b74c 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -18,4 +18,6 @@ from .treewalkers import getTreeWalker from .serializer import serialize +__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", + "getTreeWalker", "serialize"] __version__ = "1.0b1" diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 989691a4..9d319a5c 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,7 +1,6 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass -import sys import types from . import inputstream @@ -14,10 +13,10 @@ from . import utils from . import constants from .constants import spaceCharacters, asciiUpper2Lower -from .constants import formattingElements, specialElements -from .constants import headingElements, tableInsertModeElements -from .constants import cdataElements, rcdataElements, voidElements -from .constants import tokenTypes, ReparseException, namespaces, spaceCharacters +from .constants import specialElements +from .constants import headingElements +from .constants import cdataElements, rcdataElements +from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements def parse(doc, treebuilder="simpletree", encoding=None, @@ -88,7 +87,7 @@ def _parse(self, stream, innerHTML=False, container="div", try: self.mainLoop() break - except ReparseException as e: + except ReparseException: self.reset() def reset(self): @@ -405,7 +404,7 @@ def parseRCDataRawtext(self, token, contentType): """ assert contentType in ("RAWTEXT", "RCDATA") - element = self.tree.insertElement(token) + self.tree.insertElement(token) if contentType == 
"RAWTEXT": self.tokenizer.state = self.tokenizer.rawtextState @@ -1402,7 +1401,6 @@ def endTagFormatting(self, token): """The much-feared adoption agency algorithm""" # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # XXX Better parseError messages appreciated. - name = token["name"] # Step 1 outerLoopCounter = 0 @@ -1620,7 +1618,7 @@ def endTagScript(self, token): #document.write works def endTagOther(self, token): - node = self.tree.openElements.pop() + self.tree.openElements.pop() self.parser.phase = self.parser.originalPhase class InTablePhase(Phase): diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index ca2514e6..65875b85 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -3,7 +3,6 @@ import codecs import re -import types import sys from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -107,8 +106,7 @@ def _readFromBuffer(self, bytes): bytesToRead = len(bufferedData) - bufferOffset self.position = [bufferIndex, len(bufferedData)] bufferIndex += 1 - data = rv.append(bufferedData[bufferOffset: - bufferOffset + bytesToRead]) + rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead]) remainingBytes -= bytesToRead bufferOffset = 0 @@ -290,7 +288,6 @@ def characterErrorsUCS2(self, data): #Someone picked the wrong compile option #You lose skip = False - import sys for match in invalid_unicode_re.finditer(data): if skip: continue diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 550b4db2..ac6a4e41 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -24,8 +24,6 @@ else: unicode_encode_errors = "htmlentityreplace" - from html5lib.constants import entities - encode_entity_map = {} is_ucs4 = len("\U0010FFFF") == 1 for k, v in list(entities.items()): @@ -228,7 +226,6 @@ def serialize(self, treewalker, encoding=None): in_cdata = True elif in_cdata: self.serializeError(_("Unexpected child element of a 
CDATA element")) - attributes = [] for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()): #TODO: Add namespace support here k = attr_name diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py index 903df92a..b8ce2de3 100644 --- a/html5lib/tests/__init__.py +++ b/html5lib/tests/__init__.py @@ -1,12 +1 @@ from __future__ import absolute_import, division, unicode_literals - -import sys -import os - -parent_path = os.path.abspath(os.path.join(os.path.split(__file__)[0], "..")) - -if not parent_path in sys.path: - sys.path.insert(0, parent_path) -del parent_path - -from . import support diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index c9c3236b..3dcdc39b 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -12,8 +12,7 @@ os.path.pardir, os.path.pardir))) -import html5lib -from html5lib import html5parser, treebuilders +from html5lib import treebuilders del base_path #Build a dict of avaliable trees @@ -43,10 +42,11 @@ pass try: - import lxml.etree as lxml - treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml") + import lxml.etree as lxml # flake8: noqa except ImportError: pass +else: + treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml") def get_data_files(subdirectory, files='*.dat'): return glob.glob(os.path.join(test_dir,subdirectory,files)) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 74730e60..769e5a55 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -import re import os import unittest @@ -27,7 +26,7 @@ def test_codec_name_d(self): def runParserEncodingTest(data, encoding): p = HTMLParser() - t = p.parse(data, useChardet=False) + p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0]) @@ 
-44,18 +43,17 @@ def runPreScanEncodingTest(data, encoding): def test_encoding(): for filename in get_data_files("encoding"): - test_name = os.path.basename(filename).replace('.dat',''). \ - replace('-','') tests = TestData(filename, b"data", encoding=None) for idx, test in enumerate(tests): yield (runParserEncodingTest, test[b'data'], test[b'encoding']) yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) try: - import chardet + import chardet # flake8: noqa +except ImportError: + print("chardet not found, skipping chardet tests") +else: def test_chardet(): data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb").read() encoding = inputstream.HTMLInputStream(data).charEncoding assert encoding[0].lower() == "big5" -except ImportError: - print("chardet not found, skipping chardet tests") diff --git a/html5lib/tests/test_parser.py b/html5lib/tests/test_parser.py index 0bcd9787..ae5b87fd 100644 --- a/html5lib/tests/test_parser.py +++ b/html5lib/tests/test_parser.py @@ -3,7 +3,6 @@ import os import sys import traceback -import io import warnings import re @@ -11,8 +10,7 @@ from .support import get_data_files from .support import TestData, convert, convertExpected, treeTypes -import html5lib -from html5lib import html5parser, treebuilders, constants +from html5lib import html5parser, constants #Run the parse error checks checkParseErrors = False diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 048f41dc..a3a58a2b 100755 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -2,7 +2,7 @@ import io -from . import support +from . 
import support # flake8: noqa from html5lib import html5parser from html5lib.constants import namespaces from html5lib.treebuilders import dom diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 2da80d39..ab5de5fe 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -1,9 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -import os -import sys -import unittest - try: import json except ImportError: diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index 21abc5ba..25eee1f0 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -import os import unittest from .support import get_data_files @@ -15,7 +14,7 @@ unittest.TestCase.assertEqual = unittest.TestCase.assertEquals import html5lib -from html5lib import html5parser, serializer, constants +from html5lib import serializer, constants from html5lib.treewalkers._base import TreeWalker optionals_loaded = [] @@ -172,6 +171,5 @@ def test_serializer(): for filename in get_data_files('serializer', '*.test'): with open(filename) as fp: tests = json.load(fp) - test_name = os.path.basename(filename).replace('.test','') for index, test in enumerate(tests['tests']): yield runSerializerTest, test["input"], test["expected"], test.get("options", {}) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index cc8035fd..cd4a8132 100755 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import support +from . 
import support # flake8: noqa import unittest, codecs from html5lib.inputstream import HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index c1be14cf..ddbdf03b 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -2,9 +2,6 @@ -import sys -import os -import io import warnings import re @@ -176,7 +173,6 @@ def testTokenizer(): for filename in get_data_files('tokenizer', '*.test'): with open(filename) as fp: tests = json.load(fp) - testName = os.path.basename(filename).replace(".test","") if 'tests' in tests: for index,test in enumerate(tests['tests']): if 'initialStates' not in test: diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 566acf81..a09dde7a 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -14,7 +14,6 @@ from .support import get_data_files, TestData, convertExpected from html5lib import html5parser, treewalkers, treebuilders, constants -from html5lib.filters.lint import Filter as LintFilter, LintError def PullDOMAdapter(node): from xml.dom import Node @@ -58,42 +57,35 @@ def PullDOMAdapter(node): #"supposed" to work try: import xml.etree.ElementTree as ElementTree +except ImportError: + pass +else: treeTypes['ElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} -except ImportError: - try: - import elementtree.ElementTree as ElementTree - treeTypes['ElementTree'] = \ - {"builder": treebuilders.getTreeBuilder("etree", ElementTree), - "walker": treewalkers.getTreeWalker("etree", ElementTree)} - except ImportError: - pass try: import xml.etree.cElementTree as ElementTree +except ImportError: + pass +else: treeTypes['cElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} -except 
ImportError: - try: - import cElementTree as ElementTree - treeTypes['cElementTree'] = \ - {"builder": treebuilders.getTreeBuilder("etree", ElementTree), - "walker": treewalkers.getTreeWalker("etree", ElementTree)} - except ImportError: - pass + try: - import lxml.etree as ElementTree + import lxml.etree as ElementTree # flake8: noqa +except ImportError: + pass +else: # treeTypes['lxml_as_etree'] = \ # {"builder": treebuilders.getTreeBuilder("etree", ElementTree), # "walker": treewalkers.getTreeWalker("etree", ElementTree)} treeTypes['lxml_native'] = \ {"builder": treebuilders.getTreeBuilder("lxml"), "walker": treewalkers.getTreeWalker("lxml")} -except ImportError: - pass + #Try whatever etree implementations are available from a list that are #"supposed" to work diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py index ddcaa69f..8668561d 100644 --- a/html5lib/tests/tokenizertotree.py +++ b/html5lib/tests/tokenizertotree.py @@ -7,7 +7,6 @@ import html5lib from . import support -from . import test_parser from . 
import test_tokenizer p = html5lib.HTMLParser() diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 72d3057a..dd54eb67 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -8,8 +8,8 @@ from collections import deque from .constants import spaceCharacters -from .constants import entitiesWindows1252, entities -from .constants import asciiLowercase, asciiLetters, asciiUpper2Lower +from .constants import entities +from .constants import asciiLetters, asciiUpper2Lower from .constants import digits, hexDigits, EOF from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters @@ -798,7 +798,7 @@ def scriptDataDoubleEscapedDashState(self): self.state = self.scriptDataDoubleEscapedState return True - def scriptDataDoubleEscapedDashState(self): + def scriptDataDoubleEscapedDashDashState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index 122fdc2e..e44e9914 100755 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -34,8 +34,6 @@ treeBuilderCache = {} -import sys - def getTreeBuilder(treeType, implementation=None, **kwargs): """Get a TreeBuilder class for various types of tree with built-in support diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 7c6358b7..f48a53fe 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -2,11 +2,10 @@ from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE -import re import weakref from . 
import _base -from html5lib import constants, ihatexml +from html5lib import constants from html5lib.constants import namespaces from html5lib.utils import moduleFactoryFactory diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 48c3ce7c..8dc9c86b 100755 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -190,7 +190,6 @@ def __init__(self): def testSerializer(element): rv = [] - finalText = None def serializeElement(element, indent=0): if not(hasattr(element, "tag")): element = element.getroot() @@ -204,10 +203,8 @@ def serializeElement(element, indent=0): rv.append(""%(element.text,)) elif element.tag == "DOCUMENT_ROOT": rv.append("#document") - if element.text: - rv.append("|%s\"%s\""%(' '*(indent+2), element.text)) - if element.tail: - finalText = element.tail + assert element.text is None + assert element.tail is None elif element.tag == ElementTreeCommentType: rv.append("|%s"%(' '*indent, element.text)) else: @@ -245,15 +242,11 @@ def serializeElement(element, indent=0): rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) serializeElement(element, 0) - if finalText is not None: - rv.append("|%s\"%s\""%(' '*2, finalText)) - return "\n".join(rv) def tostring(element): """Serialize an element and its child nodes to a string""" rv = [] - finalText = None filter = ihatexml.InfosetFilter() def serializeElement(element): if type(element) == type(ElementTree.ElementTree): @@ -268,10 +261,8 @@ def serializeElement(element): else: rv.append(""%(element.text,)) elif element.tag == "DOCUMENT_ROOT": - if element.text: - rv.append(element.text) - if element.tail: - finalText = element.tail + assert element.text is None + assert element.tail is None for child in element: serializeElement(child) @@ -300,9 +291,6 @@ def serializeElement(element): serializeElement(element) - if finalText is not None: - rv.append("%s\""%(' '*2, finalText)) - return "".join(rv) class TreeBuilder(_base.TreeBuilder): diff --git 
a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 43c3f8de..69da1af6 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -95,7 +95,7 @@ def unknown(self, nodeType): class RecursiveTreeWalker(TreeWalker): def walkChildren(self, node): - raise NodeImplementedError + raise NotImplementedError def element(self, node, namespace, name, attrs, hasChildren): if name in voidElements: @@ -137,7 +137,6 @@ def __iter__(self): details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] hasChildren = False - endTag = None if type == DOCTYPE: yield self.doctype(*details) @@ -154,7 +153,6 @@ def __iter__(self): yield token hasChildren = False else: - endTag = name yield self.startTag(namespace, name, attributes) elif type == COMMENT: diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py index ddf4dc59..2739e7a4 100644 --- a/html5lib/treewalkers/dom.py +++ b/html5lib/treewalkers/dom.py @@ -6,7 +6,6 @@ _ = gettext.gettext from . import _base -from html5lib.constants import voidElements class TreeWalker(_base.NonRecursiveTreeWalker): def getNodeDetails(self, node): diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index a3cefdc6..2006cdf0 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -3,12 +3,10 @@ import gettext _ = gettext.gettext -import copy import re from . 
import _base -from html5lib.constants import voidElements -from html5lib.utils import moduleFactorFactory +from ..utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 88ab225e..365d6aec 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -3,7 +3,6 @@ from genshi.core import QName from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT -from genshi.output import NamespaceFlattener from . import _base diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index aa85e313..186f9082 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -5,12 +5,10 @@ from html5lib.treebuilders.etree import tag_regexp from gettext import gettext -import sys _ = gettext from . import _base -from html5lib.constants import voidElements from html5lib import ihatexml def ensure_str(s): diff --git a/html5lib/treewalkers/simpletree.py b/html5lib/treewalkers/simpletree.py index 48202036..a2abec85 100644 --- a/html5lib/treewalkers/simpletree.py +++ b/html5lib/treewalkers/simpletree.py @@ -47,7 +47,7 @@ def getNodeDetails(self, node): return _base.COMMENT, node.data else: - return _node.UNKNOWN, node.type + return _base.UNKNOWN, node.type def getFirstChild(self, node): if isinstance(node, tuple): # It might be the root Node diff --git a/html5lib/trie/datrie.py b/html5lib/trie/datrie.py index fc98bdc3..762b471f 100644 --- a/html5lib/trie/datrie.py +++ b/html5lib/trie/datrie.py @@ -1,7 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from itertools import chain - from datrie import Trie as DATrie from six import text_type diff --git a/html5lib/utils.py b/html5lib/utils.py index 8f5d5306..4363182b 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,6 +1,5 @@ 
from __future__ import absolute_import, division, unicode_literals -from sys import version_info from types import ModuleType class MethodDispatcher(dict): From e5b123efef2382d9939c501c2cb9ef248b360210 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 19:10:46 +0100 Subject: [PATCH 02/12] Fix assertion in etree treewalker, thereby making tests pass again. --- html5lib/treewalkers/etree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 2006cdf0..57de4aa9 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -5,6 +5,8 @@ import re +from six import text_type + from . import _base from ..utils import moduleFactoryFactory @@ -49,7 +51,7 @@ def getNodeDetails(self, node): return _base.COMMENT, node.text else: - assert type(node.tag) in (str, str), type(node.tag) + assert type(node.tag) == text_type, type(node.tag) #This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: From 627969f2f6ad48ba02c6cb3b26bded04414b1933 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 20:27:05 +0100 Subject: [PATCH 03/12] (Almost) comply with PEP 8. The two violations: - We don't in general comply with the 79 character per line limit. - constants.py violates (at least according to pep8) the hanging indent rule. On the whole, I disagree with the tool and have filed with regards to this. 
--- .travis.yml | 3 +- html5lib/constants.py | 465 ++++++++--------- html5lib/filters/inject_meta_charset.py | 31 +- html5lib/filters/lint.py | 5 +- html5lib/filters/optionaltags.py | 5 +- html5lib/filters/sanitizer.py | 4 +- html5lib/filters/whitespace.py | 5 +- html5lib/html5parser.py | 540 ++++++++++---------- html5lib/ihatexml.py | 130 ++++- html5lib/inputstream.py | 154 +++--- html5lib/sanitizer.py | 273 +++++----- html5lib/serializer/htmlserializer.py | 36 +- html5lib/tests/mockParser.py | 6 +- html5lib/tests/performance/concatenation.py | 4 + html5lib/tests/support.py | 29 +- html5lib/tests/test_encoding.py | 8 +- html5lib/tests/test_parser.py | 33 +- html5lib/tests/test_parser2.py | 46 +- html5lib/tests/test_sanitizer.py | 54 +- html5lib/tests/test_serializer.py | 15 +- html5lib/tests/test_stream.py | 12 +- html5lib/tests/test_tokenizer.py | 30 +- html5lib/tests/test_treewalkers.py | 101 ++-- html5lib/tests/test_whitespace_filter.py | 25 +- html5lib/tests/tokenizertotree.py | 15 +- html5lib/tokenizer.py | 246 +++++---- html5lib/treebuilders/__init__.py | 7 +- html5lib/treebuilders/_base.py | 77 +-- html5lib/treebuilders/dom.py | 164 +++--- html5lib/treebuilders/etree.py | 76 +-- html5lib/treebuilders/etree_lxml.py | 100 ++-- html5lib/treebuilders/simpletree.py | 43 +- html5lib/treewalkers/__init__.py | 1 + html5lib/treewalkers/_base.py | 23 +- html5lib/treewalkers/dom.py | 5 +- html5lib/treewalkers/etree.py | 11 +- html5lib/treewalkers/genshistream.py | 7 +- html5lib/treewalkers/lxmletree.py | 21 +- html5lib/treewalkers/pulldom.py | 5 +- html5lib/treewalkers/simpletree.py | 19 +- html5lib/trie/_base.py | 1 + html5lib/trie/datrie.py | 1 + html5lib/trie/py.py | 1 + html5lib/utils.py | 12 +- 44 files changed, 1531 insertions(+), 1318 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8402ab18..262df222 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,8 @@ install: script: - nosetests - - if [[ $FLAKE == "true" ]]; then flake8 --exclude=E,W 
html5lib; fi + - if [[ $FLAKE == "true" ]]; then find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501; fi + - if [[ $FLAKE == "true" ]]; then flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py; fi after_script: - python debug-info.py diff --git a/html5lib/constants.py b/html5lib/constants.py index 952fef41..1866dd78 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -1,300 +1,301 @@ from __future__ import absolute_import, division, unicode_literals -import string, gettext +import string +import gettext _ = gettext.gettext EOF = None E = { "null-character": - _("Null character in input stream, replaced with U+FFFD."), + _("Null character in input stream, replaced with U+FFFD."), "invalid-codepoint": - _("Invalid codepoint in stream."), + _("Invalid codepoint in stream."), "incorrectly-placed-solidus": - _("Solidus (/) incorrectly placed in tag."), + _("Solidus (/) incorrectly placed in tag."), "incorrect-cr-newline-entity": - _("Incorrect CR newline entity, replaced with LF."), + _("Incorrect CR newline entity, replaced with LF."), "illegal-windows-1252-entity": - _("Entity used with illegal number (windows-1252 reference)."), + _("Entity used with illegal number (windows-1252 reference)."), "cant-convert-numeric-entity": - _("Numeric entity couldn't be converted to character " - "(codepoint U+%(charAsInt)08x)."), + _("Numeric entity couldn't be converted to character " + "(codepoint U+%(charAsInt)08x)."), "illegal-codepoint-for-numeric-entity": - _("Numeric entity represents an illegal codepoint: " - "U+%(charAsInt)08x."), + _("Numeric entity represents an illegal codepoint: " + "U+%(charAsInt)08x."), "numeric-entity-without-semicolon": - _("Numeric entity didn't end with ';'."), + _("Numeric entity didn't end with ';'."), "expected-numeric-entity-but-got-eof": - _("Numeric entity expected. Got end of file instead."), + _("Numeric entity expected. 
Got end of file instead."), "expected-numeric-entity": - _("Numeric entity expected but none found."), + _("Numeric entity expected but none found."), "named-entity-without-semicolon": - _("Named entity didn't end with ';'."), + _("Named entity didn't end with ';'."), "expected-named-entity": - _("Named entity expected. Got none."), + _("Named entity expected. Got none."), "attributes-in-end-tag": - _("End tag contains unexpected attributes."), + _("End tag contains unexpected attributes."), 'self-closing-flag-on-end-tag': _("End tag contains unexpected self-closing flag."), "expected-tag-name-but-got-right-bracket": - _("Expected tag name. Got '>' instead."), + _("Expected tag name. Got '>' instead."), "expected-tag-name-but-got-question-mark": - _("Expected tag name. Got '?' instead. (HTML doesn't " - "support processing instructions.)"), + _("Expected tag name. Got '?' instead. (HTML doesn't " + "support processing instructions.)"), "expected-tag-name": - _("Expected tag name. Got something else instead"), + _("Expected tag name. Got something else instead"), "expected-closing-tag-but-got-right-bracket": - _("Expected closing tag. Got '>' instead. Ignoring ''."), + _("Expected closing tag. Got '>' instead. Ignoring ''."), "expected-closing-tag-but-got-eof": - _("Expected closing tag. Unexpected end of file."), + _("Expected closing tag. Unexpected end of file."), "expected-closing-tag-but-got-char": - _("Expected closing tag. Unexpected character '%(data)s' found."), + _("Expected closing tag. Unexpected character '%(data)s' found."), "eof-in-tag-name": - _("Unexpected end of file in the tag name."), + _("Unexpected end of file in the tag name."), "expected-attribute-name-but-got-eof": - _("Unexpected end of file. Expected attribute name instead."), + _("Unexpected end of file. 
Expected attribute name instead."), "eof-in-attribute-name": - _("Unexpected end of file in attribute name."), + _("Unexpected end of file in attribute name."), "invalid-character-in-attribute-name": _("Invalid character in attribute name"), "duplicate-attribute": - _("Dropped duplicate attribute on tag."), + _("Dropped duplicate attribute on tag."), "expected-end-of-tag-name-but-got-eof": - _("Unexpected end of file. Expected = or end of tag."), + _("Unexpected end of file. Expected = or end of tag."), "expected-attribute-value-but-got-eof": - _("Unexpected end of file. Expected attribute value."), + _("Unexpected end of file. Expected attribute value."), "expected-attribute-value-but-got-right-bracket": - _("Expected attribute value. Got '>' instead."), + _("Expected attribute value. Got '>' instead."), 'equals-in-unquoted-attribute-value': _("Unexpected = in unquoted attribute"), 'unexpected-character-in-unquoted-attribute-value': _("Unexpected character in unquoted attribute"), "invalid-character-after-attribute-name": - _("Unexpected character after attribute name."), + _("Unexpected character after attribute name."), "unexpected-character-after-attribute-value": - _("Unexpected character after attribute value."), + _("Unexpected character after attribute value."), "eof-in-attribute-value-double-quote": - _("Unexpected end of file in attribute value (\")."), + _("Unexpected end of file in attribute value (\")."), "eof-in-attribute-value-single-quote": - _("Unexpected end of file in attribute value (')."), + _("Unexpected end of file in attribute value (')."), "eof-in-attribute-value-no-quotes": - _("Unexpected end of file in attribute value."), + _("Unexpected end of file in attribute value."), "unexpected-EOF-after-solidus-in-tag": _("Unexpected end of file in tag. Expected >"), "unexpected-character-after-solidus-in-tag": _("Unexpected character after / in tag. Expected >"), "expected-dashes-or-doctype": - _("Expected '--' or 'DOCTYPE'. 
Not found."), + _("Expected '--' or 'DOCTYPE'. Not found."), "unexpected-bang-after-double-dash-in-comment": _("Unexpected ! after -- in comment"), "unexpected-space-after-double-dash-in-comment": _("Unexpected space after -- in comment"), "incorrect-comment": - _("Incorrect comment."), + _("Incorrect comment."), "eof-in-comment": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "eof-in-comment-end-dash": - _("Unexpected end of file in comment (-)"), + _("Unexpected end of file in comment (-)"), "unexpected-dash-after-double-dash-in-comment": - _("Unexpected '-' after '--' found in comment."), + _("Unexpected '-' after '--' found in comment."), "eof-in-comment-double-dash": - _("Unexpected end of file in comment (--)."), + _("Unexpected end of file in comment (--)."), "eof-in-comment-end-space-state": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "eof-in-comment-end-bang-state": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "unexpected-char-in-comment": - _("Unexpected character in comment found."), + _("Unexpected character in comment found."), "need-space-after-doctype": - _("No space after literal string 'DOCTYPE'."), + _("No space after literal string 'DOCTYPE'."), "expected-doctype-name-but-got-right-bracket": - _("Unexpected > character. Expected DOCTYPE name."), + _("Unexpected > character. Expected DOCTYPE name."), "expected-doctype-name-but-got-eof": - _("Unexpected end of file. Expected DOCTYPE name."), + _("Unexpected end of file. Expected DOCTYPE name."), "eof-in-doctype-name": - _("Unexpected end of file in DOCTYPE name."), + _("Unexpected end of file in DOCTYPE name."), "eof-in-doctype": - _("Unexpected end of file in DOCTYPE."), + _("Unexpected end of file in DOCTYPE."), "expected-space-or-right-bracket-in-doctype": - _("Expected space or '>'. Got '%(data)s'"), + _("Expected space or '>'. 
Got '%(data)s'"), "unexpected-end-of-doctype": - _("Unexpected end of DOCTYPE."), + _("Unexpected end of DOCTYPE."), "unexpected-char-in-doctype": - _("Unexpected character in DOCTYPE."), + _("Unexpected character in DOCTYPE."), "eof-in-innerhtml": - _("XXX innerHTML EOF"), + _("XXX innerHTML EOF"), "unexpected-doctype": - _("Unexpected DOCTYPE. Ignored."), + _("Unexpected DOCTYPE. Ignored."), "non-html-root": - _("html needs to be the first start tag."), + _("html needs to be the first start tag."), "expected-doctype-but-got-eof": - _("Unexpected End of file. Expected DOCTYPE."), + _("Unexpected End of file. Expected DOCTYPE."), "unknown-doctype": - _("Erroneous DOCTYPE."), + _("Erroneous DOCTYPE."), "expected-doctype-but-got-chars": - _("Unexpected non-space characters. Expected DOCTYPE."), + _("Unexpected non-space characters. Expected DOCTYPE."), "expected-doctype-but-got-start-tag": - _("Unexpected start tag (%(name)s). Expected DOCTYPE."), + _("Unexpected start tag (%(name)s). Expected DOCTYPE."), "expected-doctype-but-got-end-tag": - _("Unexpected end tag (%(name)s). Expected DOCTYPE."), + _("Unexpected end tag (%(name)s). Expected DOCTYPE."), "end-tag-after-implied-root": - _("Unexpected end tag (%(name)s) after the (implied) root element."), + _("Unexpected end tag (%(name)s) after the (implied) root element."), "expected-named-closing-tag-but-got-eof": - _("Unexpected end of file. Expected end tag (%(name)s)."), + _("Unexpected end of file. Expected end tag (%(name)s)."), "two-heads-are-not-better-than-one": - _("Unexpected start tag head in existing head. Ignored."), + _("Unexpected start tag head in existing head. Ignored."), "unexpected-end-tag": - _("Unexpected end tag (%(name)s). Ignored."), + _("Unexpected end tag (%(name)s). Ignored."), "unexpected-start-tag-out-of-my-head": - _("Unexpected start tag (%(name)s) that can be in head. Moved."), + _("Unexpected start tag (%(name)s) that can be in head. 
Moved."), "unexpected-start-tag": - _("Unexpected start tag (%(name)s)."), + _("Unexpected start tag (%(name)s)."), "missing-end-tag": - _("Missing end tag (%(name)s)."), + _("Missing end tag (%(name)s)."), "missing-end-tags": - _("Missing end tags (%(name)s)."), + _("Missing end tags (%(name)s)."), "unexpected-start-tag-implies-end-tag": - _("Unexpected start tag (%(startName)s) " - "implies end tag (%(endName)s)."), + _("Unexpected start tag (%(startName)s) " + "implies end tag (%(endName)s)."), "unexpected-start-tag-treated-as": - _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), "deprecated-tag": - _("Unexpected start tag %(name)s. Don't use it!"), + _("Unexpected start tag %(name)s. Don't use it!"), "unexpected-start-tag-ignored": - _("Unexpected start tag %(name)s. Ignored."), + _("Unexpected start tag %(name)s. Ignored."), "expected-one-end-tag-but-got-another": - _("Unexpected end tag (%(gotName)s). " - "Missing end tag (%(expectedName)s)."), + _("Unexpected end tag (%(gotName)s). " + "Missing end tag (%(expectedName)s)."), "end-tag-too-early": - _("End tag (%(name)s) seen too early. Expected other end tag."), + _("End tag (%(name)s) seen too early. Expected other end tag."), "end-tag-too-early-named": - _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), "end-tag-too-early-ignored": - _("End tag (%(name)s) seen too early. Ignored."), + _("End tag (%(name)s) seen too early. 
Ignored."), "adoption-agency-1.1": - _("End tag (%(name)s) violates step 1, " - "paragraph 1 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 1 of the adoption agency algorithm."), "adoption-agency-1.2": - _("End tag (%(name)s) violates step 1, " - "paragraph 2 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 2 of the adoption agency algorithm."), "adoption-agency-1.3": - _("End tag (%(name)s) violates step 1, " - "paragraph 3 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 3 of the adoption agency algorithm."), "adoption-agency-4.4": - _("End tag (%(name)s) violates step 4, " - "paragraph 4 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 4, " + "paragraph 4 of the adoption agency algorithm."), "unexpected-end-tag-treated-as": - _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + _("Unexpected end tag (%(originalName)s). 
Treated as %(newName)s."), "no-end-tag": - _("This element (%(name)s) has no end tag."), + _("This element (%(name)s) has no end tag."), "unexpected-implied-end-tag-in-table": - _("Unexpected implied end tag (%(name)s) in the table phase."), + _("Unexpected implied end tag (%(name)s) in the table phase."), "unexpected-implied-end-tag-in-table-body": - _("Unexpected implied end tag (%(name)s) in the table body phase."), + _("Unexpected implied end tag (%(name)s) in the table body phase."), "unexpected-char-implies-table-voodoo": - _("Unexpected non-space characters in " - "table context caused voodoo mode."), + _("Unexpected non-space characters in " + "table context caused voodoo mode."), "unexpected-hidden-input-in-table": - _("Unexpected input with type hidden in table context."), + _("Unexpected input with type hidden in table context."), "unexpected-form-in-table": - _("Unexpected form in table context."), + _("Unexpected form in table context."), "unexpected-start-tag-implies-table-voodoo": - _("Unexpected start tag (%(name)s) in " - "table context caused voodoo mode."), + _("Unexpected start tag (%(name)s) in " + "table context caused voodoo mode."), "unexpected-end-tag-implies-table-voodoo": - _("Unexpected end tag (%(name)s) in " - "table context caused voodoo mode."), + _("Unexpected end tag (%(name)s) in " + "table context caused voodoo mode."), "unexpected-cell-in-table-body": - _("Unexpected table cell start tag (%(name)s) " - "in the table body phase."), + _("Unexpected table cell start tag (%(name)s) " + "in the table body phase."), "unexpected-cell-end-tag": - _("Got table cell end tag (%(name)s) " - "while required end tags are missing."), + _("Got table cell end tag (%(name)s) " + "while required end tags are missing."), "unexpected-end-tag-in-table-body": - _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), + _("Unexpected end tag (%(name)s) in the table body phase. 
Ignored."), "unexpected-implied-end-tag-in-table-row": - _("Unexpected implied end tag (%(name)s) in the table row phase."), + _("Unexpected implied end tag (%(name)s) in the table row phase."), "unexpected-end-tag-in-table-row": - _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), + _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), "unexpected-select-in-select": - _("Unexpected select start tag in the select phase " - "treated as select end tag."), + _("Unexpected select start tag in the select phase " + "treated as select end tag."), "unexpected-input-in-select": - _("Unexpected input start tag in the select phase."), + _("Unexpected input start tag in the select phase."), "unexpected-start-tag-in-select": - _("Unexpected start tag token (%(name)s in the select phase. " - "Ignored."), + _("Unexpected start tag token (%(name)s in the select phase. " + "Ignored."), "unexpected-end-tag-in-select": - _("Unexpected end tag (%(name)s) in the select phase. Ignored."), + _("Unexpected end tag (%(name)s) in the select phase. 
Ignored."), "unexpected-table-element-start-tag-in-select-in-table": - _("Unexpected table element start tag (%(name)s) in the select in table phase."), + _("Unexpected table element start tag (%(name)s) in the select in table phase."), "unexpected-table-element-end-tag-in-select-in-table": - _("Unexpected table element end tag (%(name)s) in the select in table phase."), + _("Unexpected table element end tag (%(name)s) in the select in table phase."), "unexpected-char-after-body": - _("Unexpected non-space characters in the after body phase."), + _("Unexpected non-space characters in the after body phase."), "unexpected-start-tag-after-body": - _("Unexpected start tag token (%(name)s)" - " in the after body phase."), + _("Unexpected start tag token (%(name)s)" + " in the after body phase."), "unexpected-end-tag-after-body": - _("Unexpected end tag token (%(name)s)" - " in the after body phase."), + _("Unexpected end tag token (%(name)s)" + " in the after body phase."), "unexpected-char-in-frameset": - _("Unexpected characters in the frameset phase. Characters ignored."), + _("Unexpected characters in the frameset phase. Characters ignored."), "unexpected-start-tag-in-frameset": - _("Unexpected start tag token (%(name)s)" - " in the frameset phase. Ignored."), + _("Unexpected start tag token (%(name)s)" + " in the frameset phase. Ignored."), "unexpected-frameset-in-frameset-innerhtml": - _("Unexpected end tag token (frameset) " - "in the frameset phase (innerHTML)."), + _("Unexpected end tag token (frameset) " + "in the frameset phase (innerHTML)."), "unexpected-end-tag-in-frameset": - _("Unexpected end tag token (%(name)s)" - " in the frameset phase. Ignored."), + _("Unexpected end tag token (%(name)s)" + " in the frameset phase. Ignored."), "unexpected-char-after-frameset": - _("Unexpected non-space characters in the " - "after frameset phase. Ignored."), + _("Unexpected non-space characters in the " + "after frameset phase. 
Ignored."), "unexpected-start-tag-after-frameset": - _("Unexpected start tag (%(name)s)" - " in the after frameset phase. Ignored."), + _("Unexpected start tag (%(name)s)" + " in the after frameset phase. Ignored."), "unexpected-end-tag-after-frameset": - _("Unexpected end tag (%(name)s)" - " in the after frameset phase. Ignored."), + _("Unexpected end tag (%(name)s)" + " in the after frameset phase. Ignored."), "unexpected-end-tag-after-body-innerhtml": - _("Unexpected end tag after body(innerHtml)"), + _("Unexpected end tag after body(innerHtml)"), "expected-eof-but-got-char": - _("Unexpected non-space characters. Expected end of file."), + _("Unexpected non-space characters. Expected end of file."), "expected-eof-but-got-start-tag": - _("Unexpected start tag (%(name)s)" - ". Expected end of file."), + _("Unexpected start tag (%(name)s)" + ". Expected end of file."), "expected-eof-but-got-end-tag": - _("Unexpected end tag (%(name)s)" - ". Expected end of file."), + _("Unexpected end tag (%(name)s)" + ". Expected end of file."), "eof-in-table": - _("Unexpected end of file. Expected table content."), + _("Unexpected end of file. Expected table content."), "eof-in-select": - _("Unexpected end of file. Expected select content."), + _("Unexpected end of file. Expected select content."), "eof-in-frameset": - _("Unexpected end of file. Expected frameset content."), + _("Unexpected end of file. Expected frameset content."), "eof-in-script-in-script": - _("Unexpected end of file. Expected script content."), + _("Unexpected end of file. Expected script content."), "eof-in-foreign-lands": - _("Unexpected end of file. Expected foreign content"), + _("Unexpected end of file. 
Expected foreign content"), "non-void-element-with-trailing-solidus": - _("Trailing solidus not allowed on element %(name)s"), + _("Trailing solidus not allowed on element %(name)s"), "unexpected-html-element-in-foreign-content": - _("Element %(name)s not allowed in a non-html context"), + _("Element %(name)s not allowed in a non-html context"), "unexpected-end-tag-before-html": _("Unexpected end tag (%(name)s) before html."), "XXX-undefined-error": - ("Undefined error (this sucks and should be fixed)"), + _("Undefined error (this sucks and should be fixed)"), } namespaces = { - "html":"http://www.w3.org/1999/xhtml", - "mathml":"http://www.w3.org/1998/Math/MathML", - "svg":"http://www.w3.org/2000/svg", - "xlink":"http://www.w3.org/1999/xlink", - "xml":"http://www.w3.org/XML/1998/namespace", - "xmlns":"http://www.w3.org/2000/xmlns/" + "html": "http://www.w3.org/1999/xhtml", + "mathml": "http://www.w3.org/1998/Math/MathML", + "svg": "http://www.w3.org/2000/svg", + "xlink": "http://www.w3.org/1999/xlink", + "xml": "http://www.w3.org/XML/1998/namespace", + "xmlns": "http://www.w3.org/2000/xmlns/" } scopingElements = frozenset(( @@ -454,8 +455,8 @@ digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) -asciiUpper2Lower = dict([(ord(c),ord(c.lower())) - for c in string.ascii_uppercase]) +asciiUpper2Lower = dict([(ord(c), ord(c.lower())) + for c in string.ascii_uppercase]) # Heading elements need to be ordered headingElements = ( @@ -501,8 +502,8 @@ "": frozenset(("irrelevant",)), "style": frozenset(("scoped",)), "img": frozenset(("ismap",)), - "audio": frozenset(("autoplay","controls")), - "video": frozenset(("autoplay","controls")), + "audio": frozenset(("autoplay", "controls")), + "video": frozenset(("autoplay", "controls")), "script": frozenset(("defer", "async")), "details": frozenset(("open",)), "datagrid": frozenset(("multiple", "disabled")), @@ -521,38 +522,38 @@ # entitiesWindows1252 has to be _ordered_ and needs to have an index. 
It # therefore can't be a frozenset. entitiesWindows1252 = ( - 8364, # 0x80 0x20AC EURO SIGN - 65533, # 0x81 UNDEFINED - 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK - 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK - 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK - 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS - 8224, # 0x86 0x2020 DAGGER - 8225, # 0x87 0x2021 DOUBLE DAGGER - 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT - 8240, # 0x89 0x2030 PER MILLE SIGN - 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON - 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE - 65533, # 0x8D UNDEFINED - 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON - 65533, # 0x8F UNDEFINED - 65533, # 0x90 UNDEFINED - 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK - 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK - 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK - 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK - 8226, # 0x95 0x2022 BULLET - 8211, # 0x96 0x2013 EN DASH - 8212, # 0x97 0x2014 EM DASH - 732, # 0x98 0x02DC SMALL TILDE - 8482, # 0x99 0x2122 TRADE MARK SIGN - 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON - 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE - 65533, # 0x9D UNDEFINED - 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON - 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE 
OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS ) xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) @@ -2792,41 +2793,41 @@ } replacementCharacters = { - 0x0:"\uFFFD", - 0x0d:"\u000D", - 0x80:"\u20AC", - 0x81:"\u0081", - 0x81:"\u0081", - 0x82:"\u201A", - 0x83:"\u0192", - 0x84:"\u201E", - 0x85:"\u2026", - 0x86:"\u2020", - 0x87:"\u2021", - 0x88:"\u02C6", - 0x89:"\u2030", - 0x8A:"\u0160", - 0x8B:"\u2039", - 0x8C:"\u0152", - 0x8D:"\u008D", - 0x8E:"\u017D", - 0x8F:"\u008F", - 0x90:"\u0090", - 0x91:"\u2018", - 0x92:"\u2019", - 0x93:"\u201C", - 0x94:"\u201D", - 0x95:"\u2022", - 0x96:"\u2013", - 0x97:"\u2014", - 0x98:"\u02DC", - 0x99:"\u2122", - 0x9A:"\u0161", - 0x9B:"\u203A", - 0x9C:"\u0153", - 0x9D:"\u009D", - 0x9E:"\u017E", - 0x9F:"\u0178", + 0x0: "\uFFFD", + 0x0d: "\u000D", + 0x80: "\u20AC", + 0x81: "\u0081", + 0x81: "\u0081", + 0x82: "\u201A", + 0x83: "\u0192", + 0x84: "\u201E", + 0x85: "\u2026", + 0x86: "\u2020", + 0x87: "\u2021", + 0x88: "\u02C6", + 0x89: "\u2030", + 0x8A: "\u0160", + 0x8B: "\u2039", + 0x8C: "\u0152", + 0x8D: "\u008D", + 0x8E: "\u017D", + 0x8F: "\u008F", + 0x90: "\u0090", + 0x91: "\u2018", + 0x92: "\u2019", + 0x93: "\u201C", + 0x94: "\u201D", + 0x95: "\u2022", + 0x96: "\u2013", + 0x97: "\u2014", + 
0x98: "\u02DC", + 0x99: "\u2122", + 0x9A: "\u0161", + 0x9B: "\u203A", + 0x9C: "\u0153", + 0x9D: "\u009D", + 0x9E: "\u017E", + 0x9F: "\u0178", } encodings = { @@ -3059,25 +3060,27 @@ 'x-x-big5': 'big5'} tokenTypes = { - "Doctype":0, - "Characters":1, - "SpaceCharacters":2, - "StartTag":3, - "EndTag":4, - "EmptyTag":5, - "Comment":6, - "ParseError":7 + "Doctype": 0, + "Characters": 1, + "SpaceCharacters": 2, + "StartTag": 3, + "EndTag": 4, + "EmptyTag": 5, + "Comment": 6, + "ParseError": 7 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], tokenTypes["EmptyTag"])) -prefixes = dict([(v,k) for k,v in namespaces.items()]) +prefixes = dict([(v, k) for k, v in namespaces.items()]) prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + class DataLossWarning(UserWarning): pass + class ReparseException(Exception): pass diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py index 65a3e902..ca33b70b 100644 --- a/html5lib/filters/inject_meta_charset.py +++ b/html5lib/filters/inject_meta_charset.py @@ -2,6 +2,7 @@ from . 
import _base + class Filter(_base.Filter): def __init__(self, source, encoding): _base.Filter.__init__(self, source) @@ -20,21 +21,21 @@ def __iter__(self): elif type == "EmptyTag": if token["name"].lower() == "meta": - # replace charset with actual encoding - has_http_equiv_content_type = False - for (namespace,name),value in token["data"].items(): - if namespace != None: - continue - elif name.lower() == 'charset': - token["data"][(namespace,name)] = self.encoding - meta_found = True - break - elif name == 'http-equiv' and value.lower() == 'content-type': - has_http_equiv_content_type = True - else: - if has_http_equiv_content_type and (None, "content") in token["data"]: - token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding - meta_found = True + # replace charset with actual encoding + has_http_equiv_content_type = False + for (namespace, name), value in token["data"].items(): + if namespace is not None: + continue + elif name.lower() == 'charset': + token["data"][(namespace, name)] = self.encoding + meta_found = True + break + elif name == 'http-equiv' and value.lower() == 'content-type': + has_http_equiv_content_type = True + else: + if has_http_equiv_content_type and (None, "content") in token["data"]: + token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding + meta_found = True elif token["name"].lower() == "head" and not meta_found: # insert meta into empty head diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index bf98708d..d6f37cf4 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -9,7 +9,10 @@ from html5lib.constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class LintError(Exception): pass + +class LintError(Exception): + pass + class Filter(_base.Filter): def __iter__(self): diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py index 39d93ea5..fefe0b30 100644 --- a/html5lib/filters/optionaltags.py +++ 
b/html5lib/filters/optionaltags.py @@ -2,6 +2,7 @@ from . import _base + class Filter(_base.Filter): def slider(self): previous1 = previous2 = None @@ -17,7 +18,7 @@ def __iter__(self): type = token["type"] if type == "StartTag": if (token["data"] or - not self.is_optional_start(token["name"], previous, next)): + not self.is_optional_start(token["name"], previous, next)): yield token elif type == "EndTag": if not self.is_optional_end(token["name"], next): @@ -75,7 +76,7 @@ def is_optional_start(self, tagname, previous, next): # omit the thead and tfoot elements' end tag when they are # immediately followed by a tbody element. See is_optional_end. if previous and previous['type'] == 'EndTag' and \ - previous['name'] in ('tbody','thead','tfoot'): + previous['name'] in ('tbody', 'thead', 'tfoot'): return False return next["name"] == 'tr' else: diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index adaee595..2692023d 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -3,8 +3,10 @@ from . 
import _base from html5lib.sanitizer import HTMLSanitizerMixin + class Filter(_base.Filter, HTMLSanitizerMixin): def __iter__(self): for token in _base.Filter.__iter__(self): token = self.sanitize_token(token) - if token: yield token + if token: + yield token diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py index c2b7fb12..1f309236 100644 --- a/html5lib/filters/whitespace.py +++ b/html5lib/filters/whitespace.py @@ -8,6 +8,7 @@ SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) + class Filter(_base.Filter): spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) @@ -17,7 +18,7 @@ def __iter__(self): for token in _base.Filter.__iter__(self): type = token["type"] if type == "StartTag" \ - and (preserve or token["name"] in self.spacePreserveElements): + and (preserve or token["name"] in self.spacePreserveElements): preserve += 1 elif type == "EndTag" and preserve: @@ -32,6 +33,6 @@ def __iter__(self): yield token + def collapse_spaces(text): return SPACES_REGEX.sub(' ', text) - diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 9d319a5c..dab175dd 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -19,6 +19,7 @@ from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements + def parse(doc, treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): """Parse a string or file-like object into a tree""" @@ -26,30 +27,33 @@ def parse(doc, treebuilder="simpletree", encoding=None, p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parse(doc, encoding=encoding) + def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parseFragment(doc, container=container, encoding=encoding) + def 
method_decorator_metaclass(function): class Decorated(type): def __new__(meta, classname, bases, classDict): for attributeName, attribute in classDict.items(): - if type(attribute) == types.FunctionType: + if isinstance(attribute, types.FunctionType): attribute = function(attribute) classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) + return type.__new__(meta, classname, bases, classDict) return Decorated + class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, tree = simpletree.TreeBuilder, - tokenizer = tokenizer.HTMLTokenizer, strict = False, - namespaceHTMLElements = True, debug=False): + def __init__(self, tree=simpletree.TreeBuilder, + tokenizer=tokenizer.HTMLTokenizer, strict=False, + namespaceHTMLElements=True, debug=False): """ strict - raise an exception when a parse error is encountered @@ -94,7 +98,7 @@ def reset(self): self.tree.reset() self.firstStartTag = False self.errors = [] - self.log = [] #only used with debug mode + self.log = [] # only used with debug mode # "quirks" / "limited quirks" / "no quirks" self.compatMode = "no quirks" @@ -126,7 +130,7 @@ def reset(self): def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and - element.namespace == namespaces["mathml"]): + element.namespace == namespaces["mathml"]): return ("encoding" in element.attributes and element.attributes["encoding"].translate( asciiUpper2Lower) in @@ -177,7 +181,7 @@ def mainLoop(self): if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: - new_token= phase.processSpaceCharacters(new_token) + new_token = phase.processSpaceCharacters(new_token) elif type == StartTagToken: new_token = phase.processStartTag(new_token) elif type == EndTagToken: @@ -188,10 +192,9 @@ def mainLoop(self): new_token = phase.processDoctype(new_token) if (type == StartTagToken and 
token["selfClosing"] - and not token["selfClosingAcknowledged"]): + and not token["selfClosingAcknowledged"]): self.parseError("non-void-element-with-trailing-solidus", - {"name":token["name"]}) - + {"name": token["name"]}) # When the loop finishes it's EOF reprocess = True @@ -252,77 +255,77 @@ def normalizeToken(self, token): return token def adjustMathMLAttributes(self, token): - replacements = {"definitionurl":"definitionURL"} - for k,v in replacements.items(): + replacements = {"definitionurl": "definitionURL"} + for k, v in replacements.items(): if k in token["data"]: token["data"][v] = token["data"][k] del token["data"][k] def adjustSVGAttributes(self, token): replacements = { - "attributename":"attributeName", - "attributetype":"attributeType", - "basefrequency":"baseFrequency", - "baseprofile":"baseProfile", - "calcmode":"calcMode", - "clippathunits":"clipPathUnits", - "contentscripttype":"contentScriptType", - "contentstyletype":"contentStyleType", - "diffuseconstant":"diffuseConstant", - "edgemode":"edgeMode", - "externalresourcesrequired":"externalResourcesRequired", - "filterres":"filterRes", - "filterunits":"filterUnits", - "glyphref":"glyphRef", - "gradienttransform":"gradientTransform", - "gradientunits":"gradientUnits", - "kernelmatrix":"kernelMatrix", - "kernelunitlength":"kernelUnitLength", - "keypoints":"keyPoints", - "keysplines":"keySplines", - "keytimes":"keyTimes", - "lengthadjust":"lengthAdjust", - "limitingconeangle":"limitingConeAngle", - "markerheight":"markerHeight", - "markerunits":"markerUnits", - "markerwidth":"markerWidth", - "maskcontentunits":"maskContentUnits", - "maskunits":"maskUnits", - "numoctaves":"numOctaves", - "pathlength":"pathLength", - "patterncontentunits":"patternContentUnits", - "patterntransform":"patternTransform", - "patternunits":"patternUnits", - "pointsatx":"pointsAtX", - "pointsaty":"pointsAtY", - "pointsatz":"pointsAtZ", - "preservealpha":"preserveAlpha", - "preserveaspectratio":"preserveAspectRatio", - 
"primitiveunits":"primitiveUnits", - "refx":"refX", - "refy":"refY", - "repeatcount":"repeatCount", - "repeatdur":"repeatDur", - "requiredextensions":"requiredExtensions", - "requiredfeatures":"requiredFeatures", - "specularconstant":"specularConstant", - "specularexponent":"specularExponent", - "spreadmethod":"spreadMethod", - "startoffset":"startOffset", - "stddeviation":"stdDeviation", - "stitchtiles":"stitchTiles", - "surfacescale":"surfaceScale", - "systemlanguage":"systemLanguage", - "tablevalues":"tableValues", - "targetx":"targetX", - "targety":"targetY", - "textlength":"textLength", - "viewbox":"viewBox", - "viewtarget":"viewTarget", - "xchannelselector":"xChannelSelector", - "ychannelselector":"yChannelSelector", - "zoomandpan":"zoomAndPan" - } + "attributename": "attributeName", + "attributetype": "attributeType", + "basefrequency": "baseFrequency", + "baseprofile": "baseProfile", + "calcmode": "calcMode", + "clippathunits": "clipPathUnits", + "contentscripttype": "contentScriptType", + "contentstyletype": "contentStyleType", + "diffuseconstant": "diffuseConstant", + "edgemode": "edgeMode", + "externalresourcesrequired": "externalResourcesRequired", + "filterres": "filterRes", + "filterunits": "filterUnits", + "glyphref": "glyphRef", + "gradienttransform": "gradientTransform", + "gradientunits": "gradientUnits", + "kernelmatrix": "kernelMatrix", + "kernelunitlength": "kernelUnitLength", + "keypoints": "keyPoints", + "keysplines": "keySplines", + "keytimes": "keyTimes", + "lengthadjust": "lengthAdjust", + "limitingconeangle": "limitingConeAngle", + "markerheight": "markerHeight", + "markerunits": "markerUnits", + "markerwidth": "markerWidth", + "maskcontentunits": "maskContentUnits", + "maskunits": "maskUnits", + "numoctaves": "numOctaves", + "pathlength": "pathLength", + "patterncontentunits": "patternContentUnits", + "patterntransform": "patternTransform", + "patternunits": "patternUnits", + "pointsatx": "pointsAtX", + "pointsaty": "pointsAtY", + 
"pointsatz": "pointsAtZ", + "preservealpha": "preserveAlpha", + "preserveaspectratio": "preserveAspectRatio", + "primitiveunits": "primitiveUnits", + "refx": "refX", + "refy": "refY", + "repeatcount": "repeatCount", + "repeatdur": "repeatDur", + "requiredextensions": "requiredExtensions", + "requiredfeatures": "requiredFeatures", + "specularconstant": "specularConstant", + "specularexponent": "specularExponent", + "spreadmethod": "spreadMethod", + "startoffset": "startOffset", + "stddeviation": "stdDeviation", + "stitchtiles": "stitchTiles", + "surfacescale": "surfaceScale", + "systemlanguage": "systemLanguage", + "tablevalues": "tableValues", + "targetx": "targetX", + "targety": "targetY", + "textlength": "textLength", + "viewbox": "viewBox", + "viewtarget": "viewTarget", + "xchannelselector": "xChannelSelector", + "ychannelselector": "yChannelSelector", + "zoomandpan": "zoomAndPan" + } for originalName in list(token["data"].keys()): if originalName in replacements: svgName = replacements[originalName] @@ -331,19 +334,19 @@ def adjustSVGAttributes(self, token): def adjustForeignAttributes(self, token): replacements = { - "xlink:actuate":("xlink", "actuate", namespaces["xlink"]), - "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]), - "xlink:href":("xlink", "href", namespaces["xlink"]), - "xlink:role":("xlink", "role", namespaces["xlink"]), - "xlink:show":("xlink", "show", namespaces["xlink"]), - "xlink:title":("xlink", "title", namespaces["xlink"]), - "xlink:type":("xlink", "type", namespaces["xlink"]), - "xml:base":("xml", "base", namespaces["xml"]), - "xml:lang":("xml", "lang", namespaces["xml"]), - "xml:space":("xml", "space", namespaces["xml"]), - "xmlns":(None, "xmlns", namespaces["xmlns"]), - "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"]) - } + "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), + "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), + "xlink:href": ("xlink", "href", namespaces["xlink"]), + "xlink:role": ("xlink", 
"role", namespaces["xlink"]), + "xlink:show": ("xlink", "show", namespaces["xlink"]), + "xlink:title": ("xlink", "title", namespaces["xlink"]), + "xlink:type": ("xlink", "type", namespaces["xlink"]), + "xml:base": ("xml", "base", namespaces["xml"]), + "xml:lang": ("xml", "lang", namespaces["xml"]), + "xml:space": ("xml", "space", namespaces["xml"]), + "xmlns": (None, "xmlns", namespaces["xmlns"]), + "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"]) + } for originalName in token["data"].keys(): if originalName in replacements: @@ -359,20 +362,20 @@ def resetInsertionMode(self): # specification.) last = False newModes = { - "select":"inSelect", - "td":"inCell", - "th":"inCell", - "tr":"inRow", - "tbody":"inTableBody", - "thead":"inTableBody", - "tfoot":"inTableBody", - "caption":"inCaption", - "colgroup":"inColumnGroup", - "table":"inTable", - "head":"inBody", - "body":"inBody", - "frameset":"inFrameset", - "html":"beforeHead" + "select": "inSelect", + "td": "inCell", + "th": "inCell", + "tr": "inRow", + "tbody": "inTableBody", + "thead": "inTableBody", + "tfoot": "inTableBody", + "caption": "inCaption", + "colgroup": "inColumnGroup", + "table": "inTable", + "head": "inBody", + "body": "inBody", + "frameset": "inFrameset", + "html": "beforeHead" } for node in self.tree.openElements[::-1]: nodeName = node.name @@ -415,16 +418,18 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] + def getPhases(debug): def log(function): """Logger that records which phase processes each token""" type_names = dict((value, key) for key, value in constants.tokenTypes.items()) + def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] try: - info = {"type":type_names[token['type']]} + info = {"type": type_names[token['type']]} except: raise if token['type'] in constants.tagTokenTypes: @@ -475,8 +480,8 @@ def processStartTag(self, token): return self.startTagHandler[token["name"]](token) def 
startTagHtml(self, token): - if self.parser.firstStartTag == False and token["name"] == "html": - self.parser.parseError("non-html-root") + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in token["data"].items(): @@ -500,8 +505,8 @@ def processDoctype(self, token): systemId = token["systemId"] correct = token["correct"] - if (name != "html" or publicId != None or - systemId != None and systemId != "about:legacy-compat"): + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): self.parser.parseError("unknown-doctype") if publicId is None: @@ -576,8 +581,8 @@ def processDoctype(self, token): or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and - systemId == None - or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + systemId is None + or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", @@ -585,7 +590,7 @@ def processDoctype(self, token): or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and - systemId != None): + systemId is not None): self.parser.compatMode = "limited quirks" self.parser.phase = self.parser.phases["beforeHtml"] @@ -601,13 +606,13 @@ def processCharacters(self, token): def processStartTag(self, token): self.parser.parseError("expected-doctype-but-got-start-tag", - {"name": token["name"]}) + {"name": token["name"]}) self.anythingElse() return token def processEndTag(self, token): self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) + {"name": token["name"]}) 
self.anythingElse() return token @@ -616,7 +621,6 @@ def processEOF(self): self.anythingElse() return True - class BeforeHtmlPhase(Phase): # helper methods def insertHtmlElement(self): @@ -647,12 +651,11 @@ def processStartTag(self, token): def processEndTag(self, token): if token["name"] not in ("head", "body", "html", "br"): self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) + {"name": token["name"]}) else: self.insertHtmlElement() return token - class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -697,13 +700,13 @@ def endTagImplyHead(self, token): def endTagOther(self, token): self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + {"name": token["name"]}) class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), @@ -722,7 +725,7 @@ def __init__(self, parser, tree): self.endTagHandler.default = self.endTagOther # the real thing - def processEOF (self): + def processEOF(self): self.anythingElse() return True @@ -766,7 +769,7 @@ def startTagTitle(self, token): self.parser.parseRCDataRawtext(token, "RCDATA") def startTagNoScriptNoFramesStyle(self, token): - #Need to decide whether to implement the scripting-disabled case + # Need to decide whether to implement the scripting-disabled case self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagScript(self, token): @@ -781,7 +784,7 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() - assert node.name == "head", "Expected head got %s"%node.name + assert node.name == "head", "Expected head got %s" % node.name self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, 
token): @@ -794,12 +797,10 @@ def endTagOther(self, token): def anythingElse(self): self.endTagHead(impliedTagToken("head")) - # XXX If we implement a parser for which scripting is disabled we need to # implement this phase. # # class InHeadNoScriptPhase(Phase): - class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -810,7 +811,7 @@ def __init__(self, parser, tree): ("frameset", self.startTagFrameset), (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title"), - self.startTagFromHead), + self.startTagFromHead), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther @@ -840,7 +841,7 @@ def startTagFrameset(self, token): def startTagFromHead(self, token): self.parser.parseError("unexpected-start-tag-out-of-my-head", - {"name": token["name"]}) + {"name": token["name"]}) self.tree.openElements.append(self.tree.headPointer) self.parser.phases["inHead"].processStartTag(token) for node in self.tree.openElements[::-1]: @@ -849,7 +850,7 @@ def startTagFromHead(self, token): break def startTagHead(self, token): - self.parser.parseError("unexpected-start-tag", {"name":token["name"]}) + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) def startTagOther(self, token): self.anythingElse() @@ -860,21 +861,20 @@ def endTagHtmlBodyBr(self, token): return token def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name":token["name"]}) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True - class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # the really-really-really-very crazy mode def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - #Keep a ref to this for special handling of whitespace in
+            # Keep a ref to this for special handling of whitespace in <pre>
             self.processSpaceCharactersNonPre = self.processSpaceCharacters
 
             self.startTagHandler = utils.MethodDispatcher([
@@ -888,15 +888,15 @@ def __init__(self, parser, tree):
                   "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
                   "footer", "header", "hgroup", "menu", "nav", "ol", "p",
                   "section", "summary", "ul"),
-                  self.startTagCloseP),
+                 self.startTagCloseP),
                 (headingElements, self.startTagHeading),
                 (("pre", "listing"), self.startTagPreListing),
                 ("form", self.startTagForm),
                 (("li", "dd", "dt"), self.startTagListItem),
-                ("plaintext",self.startTagPlaintext),
+                ("plaintext", self.startTagPlaintext),
                 ("a", self.startTagA),
                 (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
-                  "strong", "tt", "u"),self.startTagFormatting),
+                  "strong", "tt", "u"), self.startTagFormatting),
                 ("nobr", self.startTagNobr),
                 ("button", self.startTagButton),
                 (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
@@ -924,21 +924,21 @@ def __init__(self, parser, tree):
             self.startTagHandler.default = self.startTagOther
 
             self.endTagHandler = utils.MethodDispatcher([
-                ("body",self.endTagBody),
-                ("html",self.endTagHtml),
+                ("body", self.endTagBody),
+                ("html", self.endTagHtml),
                 (("address", "article", "aside", "blockquote", "center",
                   "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
                   "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre",
                   "section", "summary", "ul"), self.endTagBlock),
                 ("form", self.endTagForm),
-                ("p",self.endTagP),
+                ("p", self.endTagP),
                 (("dd", "dt", "li"), self.endTagListItem),
                 (headingElements, self.endTagHeading),
                 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
                   "strike", "strong", "tt", "u"), self.endTagFormatting),
-                (("applet",  "marquee", "object"), self.endTagAppletMarqueeObject),
+                (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
                 ("br", self.endTagBr),
-                ])
+            ])
             self.endTagHandler.default = self.endTagOther
 
         def isMatchingFormattingElement(self, node1, node2):
@@ -980,7 +980,7 @@ def processEOF(self):
                 if node.name not in allowed_elements:
                     self.parser.parseError("expected-closing-tag-but-got-eof")
                     break
-            #Stop parsing
+            # Stop parsing
 
         def processSpaceCharactersDropNewline(self, token):
             # Sometimes (start of <pre>, <listing>, and <textarea>