diff --git a/.travis.yml b/.travis.yml index a48d27f5..66d92deb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,18 @@ env: - USE_OPTIONAL=true - USE_OPTIONAL=false +matrix: + exclude: + - python: "2.7" + env: USE_OPTIONAL=false + - python: "3.3" + env: USE_OPTIONAL=false + include: + - python: "2.7" + env: USE_OPTIONAL=false FLAKE=true + - python: "3.3" + env: USE_OPTIONAL=false FLAKE=true + before_install: - git submodule update --init --recursive @@ -19,9 +31,12 @@ install: - if [[ $TRAVIS_PYTHON_VERSION != 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION == 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi + - if [[ $FLAKE == "true" ]]; then pip install --use-mirrors flake8; fi script: - nosetests + - if [[ $FLAKE == "true" ]]; then find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501; fi + - if [[ $FLAKE == "true" ]]; then flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py; fi after_script: - python debug-info.py diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 528da9fa..10e2b74c 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -18,4 +18,6 @@ from .treewalkers import getTreeWalker from .serializer import serialize +__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", + "getTreeWalker", "serialize"] __version__ = "1.0b1" diff --git a/html5lib/constants.py b/html5lib/constants.py index 952fef41..1866dd78 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -1,300 +1,301 @@ from __future__ import absolute_import, division, unicode_literals -import string, gettext +import string +import gettext _ = gettext.gettext EOF = None E = { "null-character": - _("Null character in input stream, replaced with U+FFFD."), + _("Null character in input stream, replaced with U+FFFD."), "invalid-codepoint": - _("Invalid codepoint in stream."), + _("Invalid codepoint in stream."), "incorrectly-placed-solidus": - _("Solidus (/) incorrectly placed in tag."), + _("Solidus (/) incorrectly placed in tag."), "incorrect-cr-newline-entity": - _("Incorrect CR newline entity, replaced with LF."), + _("Incorrect CR newline entity, replaced with LF."), "illegal-windows-1252-entity": - _("Entity used with illegal number (windows-1252 reference)."), + _("Entity used with illegal number (windows-1252 reference)."), "cant-convert-numeric-entity": - _("Numeric entity couldn't be converted to character " - "(codepoint U+%(charAsInt)08x)."), + _("Numeric entity couldn't be converted to character " + "(codepoint U+%(charAsInt)08x)."), "illegal-codepoint-for-numeric-entity": - _("Numeric entity represents an illegal codepoint: " - "U+%(charAsInt)08x."), + _("Numeric entity represents an illegal codepoint: " + "U+%(charAsInt)08x."), "numeric-entity-without-semicolon": - _("Numeric entity didn't end with ';'."), + _("Numeric entity didn't end with ';'."), "expected-numeric-entity-but-got-eof": - _("Numeric entity expected. Got end of file instead."), + _("Numeric entity expected. Got end of file instead."), "expected-numeric-entity": - _("Numeric entity expected but none found."), + _("Numeric entity expected but none found."), "named-entity-without-semicolon": - _("Named entity didn't end with ';'."), + _("Named entity didn't end with ';'."), "expected-named-entity": - _("Named entity expected. Got none."), + _("Named entity expected. Got none."), "attributes-in-end-tag": - _("End tag contains unexpected attributes."), + _("End tag contains unexpected attributes."), 'self-closing-flag-on-end-tag': _("End tag contains unexpected self-closing flag."), "expected-tag-name-but-got-right-bracket": - _("Expected tag name. Got '>' instead."), + _("Expected tag name. Got '>' instead."), "expected-tag-name-but-got-question-mark": - _("Expected tag name. Got '?' instead. (HTML doesn't " - "support processing instructions.)"), + _("Expected tag name. Got '?' instead. (HTML doesn't " + "support processing instructions.)"), "expected-tag-name": - _("Expected tag name. Got something else instead"), + _("Expected tag name. Got something else instead"), "expected-closing-tag-but-got-right-bracket": - _("Expected closing tag. Got '>' instead. Ignoring '>'."), + _("Expected closing tag. Got '>' instead. Ignoring '>'."), "expected-closing-tag-but-got-eof": - _("Expected closing tag. Unexpected end of file."), + _("Expected closing tag. Unexpected end of file."), "expected-closing-tag-but-got-char": - _("Expected closing tag. Unexpected character '%(data)s' found."), + _("Expected closing tag. Unexpected character '%(data)s' found."), "eof-in-tag-name": - _("Unexpected end of file in the tag name."), + _("Unexpected end of file in the tag name."), "expected-attribute-name-but-got-eof": - _("Unexpected end of file. Expected attribute name instead."), + _("Unexpected end of file. Expected attribute name instead."), "eof-in-attribute-name": - _("Unexpected end of file in attribute name."), + _("Unexpected end of file in attribute name."), "invalid-character-in-attribute-name": _("Invalid character in attribute name"), "duplicate-attribute": - _("Dropped duplicate attribute on tag."), + _("Dropped duplicate attribute on tag."), "expected-end-of-tag-name-but-got-eof": - _("Unexpected end of file. Expected = or end of tag."), + _("Unexpected end of file. Expected = or end of tag."), "expected-attribute-value-but-got-eof": - _("Unexpected end of file. Expected attribute value."), + _("Unexpected end of file. Expected attribute value."), "expected-attribute-value-but-got-right-bracket": - _("Expected attribute value. Got '>' instead."), + _("Expected attribute value. Got '>' instead."), 'equals-in-unquoted-attribute-value': _("Unexpected = in unquoted attribute"), 'unexpected-character-in-unquoted-attribute-value': _("Unexpected character in unquoted attribute"), "invalid-character-after-attribute-name": - _("Unexpected character after attribute name."), + _("Unexpected character after attribute name."), "unexpected-character-after-attribute-value": - _("Unexpected character after attribute value."), + _("Unexpected character after attribute value."), "eof-in-attribute-value-double-quote": - _("Unexpected end of file in attribute value (\")."), + _("Unexpected end of file in attribute value (\")."), "eof-in-attribute-value-single-quote": - _("Unexpected end of file in attribute value (')."), + _("Unexpected end of file in attribute value (')."), "eof-in-attribute-value-no-quotes": - _("Unexpected end of file in attribute value."), + _("Unexpected end of file in attribute value."), "unexpected-EOF-after-solidus-in-tag": _("Unexpected end of file in tag. Expected >"), "unexpected-character-after-solidus-in-tag": _("Unexpected character after / in tag. Expected >"), "expected-dashes-or-doctype": - _("Expected '--' or 'DOCTYPE'. Not found."), + _("Expected '--' or 'DOCTYPE'. Not found."), "unexpected-bang-after-double-dash-in-comment": _("Unexpected ! after -- in comment"), "unexpected-space-after-double-dash-in-comment": _("Unexpected space after -- in comment"), "incorrect-comment": - _("Incorrect comment."), + _("Incorrect comment."), "eof-in-comment": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "eof-in-comment-end-dash": - _("Unexpected end of file in comment (-)"), + _("Unexpected end of file in comment (-)"), "unexpected-dash-after-double-dash-in-comment": - _("Unexpected '-' after '--' found in comment."), + _("Unexpected '-' after '--' found in comment."), "eof-in-comment-double-dash": - _("Unexpected end of file in comment (--)."), + _("Unexpected end of file in comment (--)."), "eof-in-comment-end-space-state": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "eof-in-comment-end-bang-state": - _("Unexpected end of file in comment."), + _("Unexpected end of file in comment."), "unexpected-char-in-comment": - _("Unexpected character in comment found."), + _("Unexpected character in comment found."), "need-space-after-doctype": - _("No space after literal string 'DOCTYPE'."), + _("No space after literal string 'DOCTYPE'."), "expected-doctype-name-but-got-right-bracket": - _("Unexpected > character. Expected DOCTYPE name."), + _("Unexpected > character. Expected DOCTYPE name."), "expected-doctype-name-but-got-eof": - _("Unexpected end of file. Expected DOCTYPE name."), + _("Unexpected end of file. Expected DOCTYPE name."), "eof-in-doctype-name": - _("Unexpected end of file in DOCTYPE name."), + _("Unexpected end of file in DOCTYPE name."), "eof-in-doctype": - _("Unexpected end of file in DOCTYPE."), + _("Unexpected end of file in DOCTYPE."), "expected-space-or-right-bracket-in-doctype": - _("Expected space or '>'. Got '%(data)s'"), + _("Expected space or '>'. Got '%(data)s'"), "unexpected-end-of-doctype": - _("Unexpected end of DOCTYPE."), + _("Unexpected end of DOCTYPE."), "unexpected-char-in-doctype": - _("Unexpected character in DOCTYPE."), + _("Unexpected character in DOCTYPE."), "eof-in-innerhtml": - _("XXX innerHTML EOF"), + _("XXX innerHTML EOF"), "unexpected-doctype": - _("Unexpected DOCTYPE. Ignored."), + _("Unexpected DOCTYPE. Ignored."), "non-html-root": - _("html needs to be the first start tag."), + _("html needs to be the first start tag."), "expected-doctype-but-got-eof": - _("Unexpected End of file. Expected DOCTYPE."), + _("Unexpected End of file. Expected DOCTYPE."), "unknown-doctype": - _("Erroneous DOCTYPE."), + _("Erroneous DOCTYPE."), "expected-doctype-but-got-chars": - _("Unexpected non-space characters. Expected DOCTYPE."), + _("Unexpected non-space characters. Expected DOCTYPE."), "expected-doctype-but-got-start-tag": - _("Unexpected start tag (%(name)s). Expected DOCTYPE."), + _("Unexpected start tag (%(name)s). Expected DOCTYPE."), "expected-doctype-but-got-end-tag": - _("Unexpected end tag (%(name)s). Expected DOCTYPE."), + _("Unexpected end tag (%(name)s). Expected DOCTYPE."), "end-tag-after-implied-root": - _("Unexpected end tag (%(name)s) after the (implied) root element."), + _("Unexpected end tag (%(name)s) after the (implied) root element."), "expected-named-closing-tag-but-got-eof": - _("Unexpected end of file. Expected end tag (%(name)s)."), + _("Unexpected end of file. Expected end tag (%(name)s)."), "two-heads-are-not-better-than-one": - _("Unexpected start tag head in existing head. Ignored."), + _("Unexpected start tag head in existing head. Ignored."), "unexpected-end-tag": - _("Unexpected end tag (%(name)s). Ignored."), + _("Unexpected end tag (%(name)s). Ignored."), "unexpected-start-tag-out-of-my-head": - _("Unexpected start tag (%(name)s) that can be in head. Moved."), + _("Unexpected start tag (%(name)s) that can be in head. Moved."), "unexpected-start-tag": - _("Unexpected start tag (%(name)s)."), + _("Unexpected start tag (%(name)s)."), "missing-end-tag": - _("Missing end tag (%(name)s)."), + _("Missing end tag (%(name)s)."), "missing-end-tags": - _("Missing end tags (%(name)s)."), + _("Missing end tags (%(name)s)."), "unexpected-start-tag-implies-end-tag": - _("Unexpected start tag (%(startName)s) " - "implies end tag (%(endName)s)."), + _("Unexpected start tag (%(startName)s) " + "implies end tag (%(endName)s)."), "unexpected-start-tag-treated-as": - _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), "deprecated-tag": - _("Unexpected start tag %(name)s. Don't use it!"), + _("Unexpected start tag %(name)s. Don't use it!"), "unexpected-start-tag-ignored": - _("Unexpected start tag %(name)s. Ignored."), + _("Unexpected start tag %(name)s. Ignored."), "expected-one-end-tag-but-got-another": - _("Unexpected end tag (%(gotName)s). " - "Missing end tag (%(expectedName)s)."), + _("Unexpected end tag (%(gotName)s). " + "Missing end tag (%(expectedName)s)."), "end-tag-too-early": - _("End tag (%(name)s) seen too early. Expected other end tag."), + _("End tag (%(name)s) seen too early. Expected other end tag."), "end-tag-too-early-named": - _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), "end-tag-too-early-ignored": - _("End tag (%(name)s) seen too early. Ignored."), + _("End tag (%(name)s) seen too early. Ignored."), "adoption-agency-1.1": - _("End tag (%(name)s) violates step 1, " - "paragraph 1 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 1 of the adoption agency algorithm."), "adoption-agency-1.2": - _("End tag (%(name)s) violates step 1, " - "paragraph 2 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 2 of the adoption agency algorithm."), "adoption-agency-1.3": - _("End tag (%(name)s) violates step 1, " - "paragraph 3 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 1, " + "paragraph 3 of the adoption agency algorithm."), "adoption-agency-4.4": - _("End tag (%(name)s) violates step 4, " - "paragraph 4 of the adoption agency algorithm."), + _("End tag (%(name)s) violates step 4, " + "paragraph 4 of the adoption agency algorithm."), "unexpected-end-tag-treated-as": - _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), "no-end-tag": - _("This element (%(name)s) has no end tag."), + _("This element (%(name)s) has no end tag."), "unexpected-implied-end-tag-in-table": - _("Unexpected implied end tag (%(name)s) in the table phase."), + _("Unexpected implied end tag (%(name)s) in the table phase."), "unexpected-implied-end-tag-in-table-body": - _("Unexpected implied end tag (%(name)s) in the table body phase."), + _("Unexpected implied end tag (%(name)s) in the table body phase."), "unexpected-char-implies-table-voodoo": - _("Unexpected non-space characters in " - "table context caused voodoo mode."), + _("Unexpected non-space characters in " + "table context caused voodoo mode."), "unexpected-hidden-input-in-table": - _("Unexpected input with type hidden in table context."), + _("Unexpected input with type hidden in table context."), "unexpected-form-in-table": - _("Unexpected form in table context."), + _("Unexpected form in table context."), "unexpected-start-tag-implies-table-voodoo": - _("Unexpected start tag (%(name)s) in " - "table context caused voodoo mode."), + _("Unexpected start tag (%(name)s) in " + "table context caused voodoo mode."), "unexpected-end-tag-implies-table-voodoo": - _("Unexpected end tag (%(name)s) in " - "table context caused voodoo mode."), + _("Unexpected end tag (%(name)s) in " + "table context caused voodoo mode."), "unexpected-cell-in-table-body": - _("Unexpected table cell start tag (%(name)s) " - "in the table body phase."), + _("Unexpected table cell start tag (%(name)s) " + "in the table body phase."), "unexpected-cell-end-tag": - _("Got table cell end tag (%(name)s) " - "while required end tags are missing."), + _("Got table cell end tag (%(name)s) " + "while required end tags are missing."), "unexpected-end-tag-in-table-body": - _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), + _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), "unexpected-implied-end-tag-in-table-row": - _("Unexpected implied end tag (%(name)s) in the table row phase."), + _("Unexpected implied end tag (%(name)s) in the table row phase."), "unexpected-end-tag-in-table-row": - _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), + _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), "unexpected-select-in-select": - _("Unexpected select start tag in the select phase " - "treated as select end tag."), + _("Unexpected select start tag in the select phase " + "treated as select end tag."), "unexpected-input-in-select": - _("Unexpected input start tag in the select phase."), + _("Unexpected input start tag in the select phase."), "unexpected-start-tag-in-select": - _("Unexpected start tag token (%(name)s in the select phase. " - "Ignored."), + _("Unexpected start tag token (%(name)s in the select phase. " + "Ignored."), "unexpected-end-tag-in-select": - _("Unexpected end tag (%(name)s) in the select phase. Ignored."), + _("Unexpected end tag (%(name)s) in the select phase. Ignored."), "unexpected-table-element-start-tag-in-select-in-table": - _("Unexpected table element start tag (%(name)s) in the select in table phase."), + _("Unexpected table element start tag (%(name)s) in the select in table phase."), "unexpected-table-element-end-tag-in-select-in-table": - _("Unexpected table element end tag (%(name)s) in the select in table phase."), + _("Unexpected table element end tag (%(name)s) in the select in table phase."), "unexpected-char-after-body": - _("Unexpected non-space characters in the after body phase."), + _("Unexpected non-space characters in the after body phase."), "unexpected-start-tag-after-body": - _("Unexpected start tag token (%(name)s)" - " in the after body phase."), + _("Unexpected start tag token (%(name)s)" + " in the after body phase."), "unexpected-end-tag-after-body": - _("Unexpected end tag token (%(name)s)" - " in the after body phase."), + _("Unexpected end tag token (%(name)s)" + " in the after body phase."), "unexpected-char-in-frameset": - _("Unexpected characters in the frameset phase. Characters ignored."), + _("Unexpected characters in the frameset phase. Characters ignored."), "unexpected-start-tag-in-frameset": - _("Unexpected start tag token (%(name)s)" - " in the frameset phase. Ignored."), + _("Unexpected start tag token (%(name)s)" + " in the frameset phase. Ignored."), "unexpected-frameset-in-frameset-innerhtml": - _("Unexpected end tag token (frameset) " - "in the frameset phase (innerHTML)."), + _("Unexpected end tag token (frameset) " + "in the frameset phase (innerHTML)."), "unexpected-end-tag-in-frameset": - _("Unexpected end tag token (%(name)s)" - " in the frameset phase. Ignored."), + _("Unexpected end tag token (%(name)s)" + " in the frameset phase. Ignored."), "unexpected-char-after-frameset": - _("Unexpected non-space characters in the " - "after frameset phase. Ignored."), + _("Unexpected non-space characters in the " + "after frameset phase. Ignored."), "unexpected-start-tag-after-frameset": - _("Unexpected start tag (%(name)s)" - " in the after frameset phase. Ignored."), + _("Unexpected start tag (%(name)s)" + " in the after frameset phase. Ignored."), "unexpected-end-tag-after-frameset": - _("Unexpected end tag (%(name)s)" - " in the after frameset phase. Ignored."), + _("Unexpected end tag (%(name)s)" + " in the after frameset phase. Ignored."), "unexpected-end-tag-after-body-innerhtml": - _("Unexpected end tag after body(innerHtml)"), + _("Unexpected end tag after body(innerHtml)"), "expected-eof-but-got-char": - _("Unexpected non-space characters. Expected end of file."), + _("Unexpected non-space characters. Expected end of file."), "expected-eof-but-got-start-tag": - _("Unexpected start tag (%(name)s)" - ". Expected end of file."), + _("Unexpected start tag (%(name)s)" + ". Expected end of file."), "expected-eof-but-got-end-tag": - _("Unexpected end tag (%(name)s)" - ". Expected end of file."), + _("Unexpected end tag (%(name)s)" + ". Expected end of file."), "eof-in-table": - _("Unexpected end of file. Expected table content."), + _("Unexpected end of file. Expected table content."), "eof-in-select": - _("Unexpected end of file. Expected select content."), + _("Unexpected end of file. Expected select content."), "eof-in-frameset": - _("Unexpected end of file. Expected frameset content."), + _("Unexpected end of file. Expected frameset content."), "eof-in-script-in-script": - _("Unexpected end of file. Expected script content."), + _("Unexpected end of file. Expected script content."), "eof-in-foreign-lands": - _("Unexpected end of file. Expected foreign content"), + _("Unexpected end of file. Expected foreign content"), "non-void-element-with-trailing-solidus": - _("Trailing solidus not allowed on element %(name)s"), + _("Trailing solidus not allowed on element %(name)s"), "unexpected-html-element-in-foreign-content": - _("Element %(name)s not allowed in a non-html context"), + _("Element %(name)s not allowed in a non-html context"), "unexpected-end-tag-before-html": _("Unexpected end tag (%(name)s) before html."), "XXX-undefined-error": - ("Undefined error (this sucks and should be fixed)"), + _("Undefined error (this sucks and should be fixed)"), } namespaces = { - "html":"http://www.w3.org/1999/xhtml", - "mathml":"http://www.w3.org/1998/Math/MathML", - "svg":"http://www.w3.org/2000/svg", - "xlink":"http://www.w3.org/1999/xlink", - "xml":"http://www.w3.org/XML/1998/namespace", - "xmlns":"http://www.w3.org/2000/xmlns/" + "html": "http://www.w3.org/1999/xhtml", + "mathml": "http://www.w3.org/1998/Math/MathML", + "svg": "http://www.w3.org/2000/svg", + "xlink": "http://www.w3.org/1999/xlink", + "xml": "http://www.w3.org/XML/1998/namespace", + "xmlns": "http://www.w3.org/2000/xmlns/" } scopingElements = frozenset(( @@ -454,8 +455,8 @@ digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) -asciiUpper2Lower = dict([(ord(c),ord(c.lower())) - for c in string.ascii_uppercase]) +asciiUpper2Lower = dict([(ord(c), ord(c.lower())) + for c in string.ascii_uppercase]) # Heading elements need to be ordered headingElements = ( @@ -501,8 +502,8 @@ "": frozenset(("irrelevant",)), "style": frozenset(("scoped",)), "img": frozenset(("ismap",)), - "audio": frozenset(("autoplay","controls")), - "video": frozenset(("autoplay","controls")), + "audio": frozenset(("autoplay", "controls")), + "video": frozenset(("autoplay", "controls")), "script": frozenset(("defer", "async")), "details": frozenset(("open",)), "datagrid": frozenset(("multiple", "disabled")), @@ -521,38 +522,38 @@ # entitiesWindows1252 has to be _ordered_ and needs to have an index. It # therefore can't be a frozenset. entitiesWindows1252 = ( - 8364, # 0x80 0x20AC EURO SIGN - 65533, # 0x81 UNDEFINED - 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK - 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK - 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK - 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS - 8224, # 0x86 0x2020 DAGGER - 8225, # 0x87 0x2021 DOUBLE DAGGER - 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT - 8240, # 0x89 0x2030 PER MILLE SIGN - 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON - 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE - 65533, # 0x8D UNDEFINED - 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON - 65533, # 0x8F UNDEFINED - 65533, # 0x90 UNDEFINED - 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK - 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK - 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK - 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK - 8226, # 0x95 0x2022 BULLET - 8211, # 0x96 0x2013 EN DASH - 8212, # 0x97 0x2014 EM DASH - 732, # 0x98 0x02DC SMALL TILDE - 8482, # 0x99 0x2122 TRADE MARK SIGN - 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON - 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE - 65533, # 0x9D UNDEFINED - 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON - 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS ) xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) @@ -2792,41 +2793,41 @@ } replacementCharacters = { - 0x0:"\uFFFD", - 0x0d:"\u000D", - 0x80:"\u20AC", - 0x81:"\u0081", - 0x81:"\u0081", - 0x82:"\u201A", - 0x83:"\u0192", - 0x84:"\u201E", - 0x85:"\u2026", - 0x86:"\u2020", - 0x87:"\u2021", - 0x88:"\u02C6", - 0x89:"\u2030", - 0x8A:"\u0160", - 0x8B:"\u2039", - 0x8C:"\u0152", - 0x8D:"\u008D", - 0x8E:"\u017D", - 0x8F:"\u008F", - 0x90:"\u0090", - 0x91:"\u2018", - 0x92:"\u2019", - 0x93:"\u201C", - 0x94:"\u201D", - 0x95:"\u2022", - 0x96:"\u2013", - 0x97:"\u2014", - 0x98:"\u02DC", - 0x99:"\u2122", - 0x9A:"\u0161", - 0x9B:"\u203A", - 0x9C:"\u0153", - 0x9D:"\u009D", - 0x9E:"\u017E", - 0x9F:"\u0178", + 0x0: "\uFFFD", + 0x0d: "\u000D", + 0x80: "\u20AC", + 0x81: "\u0081", + 0x81: "\u0081", + 0x82: "\u201A", + 0x83: "\u0192", + 0x84: "\u201E", + 0x85: "\u2026", + 0x86: "\u2020", + 0x87: "\u2021", + 0x88: "\u02C6", + 0x89: "\u2030", + 0x8A: "\u0160", + 0x8B: "\u2039", + 0x8C: "\u0152", + 0x8D: "\u008D", + 0x8E: "\u017D", + 0x8F: "\u008F", + 0x90: "\u0090", + 0x91: "\u2018", + 0x92: "\u2019", + 0x93: "\u201C", + 0x94: "\u201D", + 0x95: "\u2022", + 0x96: "\u2013", + 0x97: "\u2014", + 0x98: "\u02DC", + 0x99: "\u2122", + 0x9A: "\u0161", + 0x9B: "\u203A", + 0x9C: "\u0153", + 0x9D: "\u009D", + 0x9E: "\u017E", + 0x9F: "\u0178", } encodings = { @@ -3059,25 +3060,27 @@ 'x-x-big5': 'big5'} tokenTypes = { - "Doctype":0, - "Characters":1, - "SpaceCharacters":2, - "StartTag":3, - "EndTag":4, - "EmptyTag":5, - "Comment":6, - "ParseError":7 + "Doctype": 0, + "Characters": 1, + "SpaceCharacters": 2, + "StartTag": 3, + "EndTag": 4, + "EmptyTag": 5, + "Comment": 6, + "ParseError": 7 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], tokenTypes["EmptyTag"])) -prefixes = dict([(v,k) for k,v in namespaces.items()]) +prefixes = dict([(v, k) for k, v in namespaces.items()]) prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + class DataLossWarning(UserWarning): pass + class ReparseException(Exception): pass diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py index 65a3e902..ca33b70b 100644 --- a/html5lib/filters/inject_meta_charset.py +++ b/html5lib/filters/inject_meta_charset.py @@ -2,6 +2,7 @@ from . import _base + class Filter(_base.Filter): def __init__(self, source, encoding): _base.Filter.__init__(self, source) @@ -20,21 +21,21 @@ def __iter__(self): elif type == "EmptyTag": if token["name"].lower() == "meta": - # replace charset with actual encoding - has_http_equiv_content_type = False - for (namespace,name),value in token["data"].items(): - if namespace != None: - continue - elif name.lower() == 'charset': - token["data"][(namespace,name)] = self.encoding - meta_found = True - break - elif name == 'http-equiv' and value.lower() == 'content-type': - has_http_equiv_content_type = True - else: - if has_http_equiv_content_type and (None, "content") in token["data"]: - token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding - meta_found = True + # replace charset with actual encoding + has_http_equiv_content_type = False + for (namespace, name), value in token["data"].items(): + if namespace is not None: + continue + elif name.lower() == 'charset': + token["data"][(namespace, name)] = self.encoding + meta_found = True + break + elif name == 'http-equiv' and value.lower() == 'content-type': + has_http_equiv_content_type = True + else: + if has_http_equiv_content_type and (None, "content") in token["data"]: + token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding + meta_found = True elif token["name"].lower() == "head" and not meta_found: # insert meta into empty head diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index bf98708d..d6f37cf4 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -9,7 +9,10 @@ from html5lib.constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class LintError(Exception): pass + +class LintError(Exception): + pass + class Filter(_base.Filter): def __iter__(self): diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py index 39d93ea5..fefe0b30 100644 --- a/html5lib/filters/optionaltags.py +++ b/html5lib/filters/optionaltags.py @@ -2,6 +2,7 @@ from . import _base + class Filter(_base.Filter): def slider(self): previous1 = previous2 = None @@ -17,7 +18,7 @@ def __iter__(self): type = token["type"] if type == "StartTag": if (token["data"] or - not self.is_optional_start(token["name"], previous, next)): + not self.is_optional_start(token["name"], previous, next)): yield token elif type == "EndTag": if not self.is_optional_end(token["name"], next): @@ -75,7 +76,7 @@ def is_optional_start(self, tagname, previous, next): # omit the thead and tfoot elements' end tag when they are # immediately followed by a tbody element. See is_optional_end. if previous and previous['type'] == 'EndTag' and \ - previous['name'] in ('tbody','thead','tfoot'): + previous['name'] in ('tbody', 'thead', 'tfoot'): return False return next["name"] == 'tr' else: diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index adaee595..2692023d 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -3,8 +3,10 @@ from . import _base from html5lib.sanitizer import HTMLSanitizerMixin + class Filter(_base.Filter, HTMLSanitizerMixin): def __iter__(self): for token in _base.Filter.__iter__(self): token = self.sanitize_token(token) - if token: yield token + if token: + yield token diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py index c2b7fb12..1f309236 100644 --- a/html5lib/filters/whitespace.py +++ b/html5lib/filters/whitespace.py @@ -8,6 +8,7 @@ SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) + class Filter(_base.Filter): spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) @@ -17,7 +18,7 @@ def __iter__(self): for token in _base.Filter.__iter__(self): type = token["type"] if type == "StartTag" \ - and (preserve or token["name"] in self.spacePreserveElements): + and (preserve or token["name"] in self.spacePreserveElements): preserve += 1 elif type == "EndTag" and preserve: @@ -32,6 +33,6 @@ def __iter__(self): yield token + def collapse_spaces(text): return SPACES_REGEX.sub(' ', text) - diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 989691a4..dab175dd 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,7 +1,6 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass -import sys import types from . import inputstream @@ -14,12 +13,13 @@ from . import utils from . import constants from .constants import spaceCharacters, asciiUpper2Lower -from .constants import formattingElements, specialElements -from .constants import headingElements, tableInsertModeElements -from .constants import cdataElements, rcdataElements, voidElements -from .constants import tokenTypes, ReparseException, namespaces, spaceCharacters +from .constants import specialElements +from .constants import headingElements +from .constants import cdataElements, rcdataElements +from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements + def parse(doc, treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): """Parse a string or file-like object into a tree""" @@ -27,30 +27,33 @@ def parse(doc, treebuilder="simpletree", encoding=None, p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parse(doc, encoding=encoding) + def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parseFragment(doc, container=container, encoding=encoding) + def method_decorator_metaclass(function): class Decorated(type): def __new__(meta, classname, bases, classDict): for attributeName, attribute in classDict.items(): - if type(attribute) == types.FunctionType: + if isinstance(attribute, types.FunctionType): attribute = function(attribute) classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) + return type.__new__(meta, classname, bases, classDict) return Decorated + class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, tree = simpletree.TreeBuilder, - tokenizer = tokenizer.HTMLTokenizer, strict = False, - namespaceHTMLElements = True, debug=False): + def __init__(self, tree=simpletree.TreeBuilder, + tokenizer=tokenizer.HTMLTokenizer, strict=False, + namespaceHTMLElements=True, debug=False): """ strict - raise an exception when a parse error is encountered @@ -88,14 +91,14 @@ def _parse(self, stream, innerHTML=False, container="div", try: self.mainLoop() break - except ReparseException as e: + except ReparseException: self.reset() def reset(self): self.tree.reset() self.firstStartTag = False self.errors = [] - self.log = [] #only used with debug mode + self.log = [] # only used with debug mode # "quirks" / "limited quirks" / "no quirks" self.compatMode = "no quirks" @@ -127,7 +130,7 @@ def reset(self): def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and - element.namespace == namespaces["mathml"]): + element.namespace == namespaces["mathml"]): return ("encoding" in element.attributes and element.attributes["encoding"].translate( asciiUpper2Lower) in @@ -178,7 +181,7 @@ def mainLoop(self): if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: - new_token= phase.processSpaceCharacters(new_token) + new_token = phase.processSpaceCharacters(new_token) elif type == StartTagToken: new_token = phase.processStartTag(new_token) elif type == EndTagToken: @@ -189,10 +192,9 @@ def mainLoop(self): new_token = phase.processDoctype(new_token) if (type == StartTagToken and token["selfClosing"] - and not token["selfClosingAcknowledged"]): + and not token["selfClosingAcknowledged"]): self.parseError("non-void-element-with-trailing-solidus", - {"name":token["name"]}) - + {"name": token["name"]}) # When the loop finishes it's EOF reprocess = True @@ -253,77 +255,77 @@ def normalizeToken(self, token): return token def adjustMathMLAttributes(self, token): - replacements = {"definitionurl":"definitionURL"} - for k,v in replacements.items(): + replacements = {"definitionurl": "definitionURL"} + for k, v in replacements.items(): if k in token["data"]: token["data"][v] = token["data"][k] del token["data"][k] def adjustSVGAttributes(self, token): replacements = { - "attributename":"attributeName", - "attributetype":"attributeType", - "basefrequency":"baseFrequency", - "baseprofile":"baseProfile", - "calcmode":"calcMode", - "clippathunits":"clipPathUnits", - "contentscripttype":"contentScriptType", - "contentstyletype":"contentStyleType", - "diffuseconstant":"diffuseConstant", - "edgemode":"edgeMode", - "externalresourcesrequired":"externalResourcesRequired", - "filterres":"filterRes", - "filterunits":"filterUnits", - "glyphref":"glyphRef", - "gradienttransform":"gradientTransform", - "gradientunits":"gradientUnits", - "kernelmatrix":"kernelMatrix", - "kernelunitlength":"kernelUnitLength", - "keypoints":"keyPoints", - "keysplines":"keySplines", - "keytimes":"keyTimes", - "lengthadjust":"lengthAdjust", - "limitingconeangle":"limitingConeAngle", - "markerheight":"markerHeight", - "markerunits":"markerUnits", - "markerwidth":"markerWidth", - "maskcontentunits":"maskContentUnits", - "maskunits":"maskUnits", - "numoctaves":"numOctaves", - "pathlength":"pathLength", - "patterncontentunits":"patternContentUnits", - "patterntransform":"patternTransform", - "patternunits":"patternUnits", - "pointsatx":"pointsAtX", - "pointsaty":"pointsAtY", - "pointsatz":"pointsAtZ", - "preservealpha":"preserveAlpha", - "preserveaspectratio":"preserveAspectRatio", - "primitiveunits":"primitiveUnits", - "refx":"refX", - "refy":"refY", - "repeatcount":"repeatCount", - "repeatdur":"repeatDur", - "requiredextensions":"requiredExtensions", - "requiredfeatures":"requiredFeatures", - "specularconstant":"specularConstant", - "specularexponent":"specularExponent", - "spreadmethod":"spreadMethod", - "startoffset":"startOffset", - "stddeviation":"stdDeviation", - "stitchtiles":"stitchTiles", - "surfacescale":"surfaceScale", - "systemlanguage":"systemLanguage", - "tablevalues":"tableValues", - "targetx":"targetX", - "targety":"targetY", - "textlength":"textLength", - "viewbox":"viewBox", - "viewtarget":"viewTarget", - "xchannelselector":"xChannelSelector", - "ychannelselector":"yChannelSelector", - "zoomandpan":"zoomAndPan" - } + "attributename": "attributeName", + "attributetype": "attributeType", + "basefrequency": "baseFrequency", + "baseprofile": "baseProfile", + "calcmode": "calcMode", + "clippathunits": "clipPathUnits", + "contentscripttype": "contentScriptType", + "contentstyletype": "contentStyleType", + "diffuseconstant": "diffuseConstant", + "edgemode": "edgeMode", + "externalresourcesrequired": "externalResourcesRequired", + "filterres": "filterRes", + "filterunits": "filterUnits", + "glyphref": "glyphRef", + "gradienttransform": "gradientTransform", + "gradientunits": "gradientUnits", + "kernelmatrix": "kernelMatrix", + "kernelunitlength": "kernelUnitLength", + "keypoints": "keyPoints", + "keysplines": "keySplines", + "keytimes": "keyTimes", + "lengthadjust": "lengthAdjust", + "limitingconeangle": "limitingConeAngle", + "markerheight": "markerHeight", + "markerunits": "markerUnits", + "markerwidth": "markerWidth", + "maskcontentunits": "maskContentUnits", + "maskunits": "maskUnits", + "numoctaves": "numOctaves", + "pathlength": "pathLength", + "patterncontentunits": "patternContentUnits", + "patterntransform": "patternTransform", + "patternunits": "patternUnits", + "pointsatx": "pointsAtX", + "pointsaty": "pointsAtY", + "pointsatz": "pointsAtZ", + "preservealpha": "preserveAlpha", + "preserveaspectratio": "preserveAspectRatio", + "primitiveunits": "primitiveUnits", + "refx": "refX", + "refy": "refY", + "repeatcount": "repeatCount", + "repeatdur": "repeatDur", + "requiredextensions": "requiredExtensions", + "requiredfeatures": "requiredFeatures", + "specularconstant": "specularConstant", + "specularexponent": "specularExponent", + "spreadmethod": "spreadMethod", + "startoffset": "startOffset", + "stddeviation": "stdDeviation", + "stitchtiles": "stitchTiles", + "surfacescale": "surfaceScale", + "systemlanguage": "systemLanguage", + "tablevalues": "tableValues", + "targetx": "targetX", + "targety": "targetY", + "textlength": "textLength", + "viewbox": "viewBox", + "viewtarget": "viewTarget", + "xchannelselector": "xChannelSelector", + "ychannelselector": "yChannelSelector", + "zoomandpan": "zoomAndPan" + } for originalName in list(token["data"].keys()): if originalName in replacements: svgName = replacements[originalName] @@ -332,19 +334,19 @@ def adjustSVGAttributes(self, token): def adjustForeignAttributes(self, token): replacements = { - "xlink:actuate":("xlink", "actuate", namespaces["xlink"]), - "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]), - "xlink:href":("xlink", "href", namespaces["xlink"]), - "xlink:role":("xlink", "role", namespaces["xlink"]), - "xlink:show":("xlink", "show", namespaces["xlink"]), - "xlink:title":("xlink", "title", namespaces["xlink"]), - "xlink:type":("xlink", "type", namespaces["xlink"]), - "xml:base":("xml", "base", namespaces["xml"]), - "xml:lang":("xml", "lang", namespaces["xml"]), - "xml:space":("xml", "space", namespaces["xml"]), - "xmlns":(None, "xmlns", namespaces["xmlns"]), - "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"]) - } + "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), + "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), + "xlink:href": ("xlink", "href", namespaces["xlink"]), + "xlink:role": ("xlink", "role", namespaces["xlink"]), + "xlink:show": ("xlink", "show", namespaces["xlink"]), + "xlink:title": ("xlink", "title", namespaces["xlink"]), + "xlink:type": ("xlink", "type", namespaces["xlink"]), + "xml:base": ("xml", "base", namespaces["xml"]), + "xml:lang": ("xml", "lang", namespaces["xml"]), + "xml:space": ("xml", "space", namespaces["xml"]), + "xmlns": (None, "xmlns", namespaces["xmlns"]), + "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"]) + } for originalName in token["data"].keys(): if originalName in replacements: @@ -360,20 +362,20 @@ def resetInsertionMode(self): # specification.) last = False newModes = { - "select":"inSelect", - "td":"inCell", - "th":"inCell", - "tr":"inRow", - "tbody":"inTableBody", - "thead":"inTableBody", - "tfoot":"inTableBody", - "caption":"inCaption", - "colgroup":"inColumnGroup", - "table":"inTable", - "head":"inBody", - "body":"inBody", - "frameset":"inFrameset", - "html":"beforeHead" + "select": "inSelect", + "td": "inCell", + "th": "inCell", + "tr": "inRow", + "tbody": "inTableBody", + "thead": "inTableBody", + "tfoot": "inTableBody", + "caption": "inCaption", + "colgroup": "inColumnGroup", + "table": "inTable", + "head": "inBody", + "body": "inBody", + "frameset": "inFrameset", + "html": "beforeHead" } for node in self.tree.openElements[::-1]: nodeName = node.name @@ -405,7 +407,7 @@ def parseRCDataRawtext(self, token, contentType): """ assert contentType in ("RAWTEXT", "RCDATA") - element = self.tree.insertElement(token) + self.tree.insertElement(token) if contentType == "RAWTEXT": self.tokenizer.state = self.tokenizer.rawtextState @@ -416,16 +418,18 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] + def getPhases(debug): def log(function): """Logger that records which phase processes each token""" type_names = dict((value, key) for key, value in constants.tokenTypes.items()) + def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] try: - info = {"type":type_names[token['type']]} + info = {"type": type_names[token['type']]} except: raise if token['type'] in constants.tagTokenTypes: @@ -476,8 +480,8 @@ def processStartTag(self, token): return self.startTagHandler[token["name"]](token) def startTagHtml(self, token): - if self.parser.firstStartTag == False and token["name"] == "html": - self.parser.parseError("non-html-root") + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in token["data"].items(): @@ -501,8 +505,8 @@ def processDoctype(self, token): systemId = token["systemId"] correct = token["correct"] - if (name != "html" or publicId != None or - systemId != None and systemId != "about:legacy-compat"): + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): self.parser.parseError("unknown-doctype") if publicId is None: @@ -577,8 +581,8 @@ def processDoctype(self, token): or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and - systemId == None - or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + systemId is None + or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", @@ -586,7 +590,7 @@ def processDoctype(self, token): or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and - systemId != None): + systemId is not None): self.parser.compatMode = "limited quirks" self.parser.phase = self.parser.phases["beforeHtml"] @@ -602,13 +606,13 @@ def processCharacters(self, token): def processStartTag(self, token): self.parser.parseError("expected-doctype-but-got-start-tag", - {"name": token["name"]}) + {"name": token["name"]}) self.anythingElse() return token def processEndTag(self, token): self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) + {"name": token["name"]}) self.anythingElse() return token @@ -617,7 +621,6 @@ def processEOF(self): self.anythingElse() return True - class BeforeHtmlPhase(Phase): # helper methods def insertHtmlElement(self): @@ -648,12 +651,11 @@ def processStartTag(self, token): def processEndTag(self, token): if token["name"] not in ("head", "body", "html", "br"): self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) + {"name": token["name"]}) else: self.insertHtmlElement() return token - class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -698,13 +700,13 @@ def endTagImplyHead(self, token): def endTagOther(self, token): self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + {"name": token["name"]}) class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), @@ -723,7 +725,7 @@ def __init__(self, parser, tree): self.endTagHandler.default = self.endTagOther # the real thing - def processEOF (self): + def processEOF(self): self.anythingElse() return True @@ -767,7 +769,7 @@ def startTagTitle(self, token): self.parser.parseRCDataRawtext(token, "RCDATA") def startTagNoScriptNoFramesStyle(self, token): - #Need to decide whether to implement the scripting-disabled case + # Need to decide whether to implement the scripting-disabled case self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagScript(self, token): @@ -782,7 +784,7 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() - assert node.name == "head", "Expected head got %s"%node.name + assert node.name == "head", "Expected head got %s" % node.name self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -795,12 +797,10 @@ def endTagOther(self, token): def anythingElse(self): self.endTagHead(impliedTagToken("head")) - # XXX If we implement a parser for which scripting is disabled we need to # implement this phase. # # class InHeadNoScriptPhase(Phase): - class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -811,7 +811,7 @@ def __init__(self, parser, tree): ("frameset", self.startTagFrameset), (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title"), - self.startTagFromHead), + self.startTagFromHead), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther @@ -841,7 +841,7 @@ def startTagFrameset(self, token): def startTagFromHead(self, token): self.parser.parseError("unexpected-start-tag-out-of-my-head", - {"name": token["name"]}) + {"name": token["name"]}) self.tree.openElements.append(self.tree.headPointer) self.parser.phases["inHead"].processStartTag(token) for node in self.tree.openElements[::-1]: @@ -850,7 +850,7 @@ def startTagFromHead(self, token): break def startTagHead(self, token): - self.parser.parseError("unexpected-start-tag", {"name":token["name"]}) + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) def startTagOther(self, token): self.anythingElse() @@ -861,21 +861,20 @@ def endTagHtmlBodyBr(self, token): return token def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name":token["name"]}) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True - class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # the really-really-really-very crazy mode def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - #Keep a ref to this for special handling of whitespace in
+ # Keep a ref to this for special handling of whitespace inself.processSpaceCharactersNonPre = self.processSpaceCharacters self.startTagHandler = utils.MethodDispatcher([ @@ -889,15 +888,15 @@ def __init__(self, parser, tree): "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul"), - self.startTagCloseP), + self.startTagCloseP), (headingElements, self.startTagHeading), (("pre", "listing"), self.startTagPreListing), ("form", self.startTagForm), (("li", "dd", "dt"), self.startTagListItem), - ("plaintext",self.startTagPlaintext), + ("plaintext", self.startTagPlaintext), ("a", self.startTagA), (("b", "big", "code", "em", "font", "i", "s", "small", "strike", - "strong", "tt", "u"),self.startTagFormatting), + "strong", "tt", "u"), self.startTagFormatting), ("nobr", self.startTagNobr), ("button", self.startTagButton), (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), @@ -925,21 +924,21 @@ def __init__(self, parser, tree): self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ - ("body",self.endTagBody), - ("html",self.endTagHtml), + ("body", self.endTagBody), + ("html", self.endTagHtml), (("address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul"), self.endTagBlock), ("form", self.endTagForm), - ("p",self.endTagP), + ("p", self.endTagP), (("dd", "dt", "li"), self.endTagListItem), (headingElements, self.endTagHeading), (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"), self.endTagFormatting), - (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), + (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), ("br", self.endTagBr), - ]) + ]) self.endTagHandler.default = self.endTagOther def isMatchingFormattingElement(self, node1, node2): @@ -981,7 +980,7 @@ def processEOF(self): if node.name not in allowed_elements: self.parser.parseError("expected-closing-tag-but-got-eof") break - #Stop parsing + # Stop parsing def processSpaceCharactersDropNewline(self, token): # Sometimes (start of,, and