Skip to content

Selected patches from Calibre #245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,6 @@ functionality:
- ``chardet`` can be used as a fallback when character encoding cannot
be determined.

- ``ordereddict`` can be used under Python 2.6
(``collections.OrderedDict`` is used instead on later versions) to
serialize attributes in alphabetical order.


Bugs
----
Expand Down
67 changes: 67 additions & 0 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,73 @@
(namespaces["mathml"], "mtext")
])

adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}

adjustMathMLAttributes = {"definitionurl": "definitionURL"}

adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
Expand Down
141 changes: 36 additions & 105 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass
from six import with_metaclass, viewkeys, PY3

import types

try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict

from . import inputstream
from . import tokenizer

from . import treebuilders
from .treebuilders._base import Marker

from . import utils
from . import constants
from .constants import spaceCharacters, asciiUpper2Lower
from .constants import specialElements
from .constants import headingElements
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
from .constants import E
from .constants import (
spaceCharacters, asciiUpper2Lower,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap,
adjustMathMLAttributes, adjustSVGAttributes,
E,
ReparseException
)


def parse(doc, treebuilder="etree", encoding=None,
Expand Down Expand Up @@ -272,96 +279,18 @@ def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """

if token["type"] == tokenTypes["StartTag"]:
token["data"] = dict(token["data"][::-1])
token["data"] = OrderedDict(token['data'][::-1])

return token

def adjustMathMLAttributes(self, token):
replacements = {"definitionurl": "definitionURL"}
for k, v in replacements.items():
if k in token["data"]:
token["data"][v] = token["data"][k]
del token["data"][k]
adjust_attributes(token, adjustMathMLAttributes)

def adjustSVGAttributes(self, token):
replacements = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
for originalName in list(token["data"].keys()):
if originalName in replacements:
svgName = replacements[originalName]
token["data"][svgName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustSVGAttributes)

def adjustForeignAttributes(self, token):
replacements = adjustForeignAttributesMap

for originalName in token["data"].keys():
if originalName in replacements:
foreignName = replacements[originalName]
token["data"][foreignName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustForeignAttributesMap)

def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
Expand Down Expand Up @@ -434,7 +363,7 @@ def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
constants.tokenTypes.items())
tokenTypes.items())

def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
Expand All @@ -443,7 +372,7 @@ def wrapped(self, *args, **kwargs):
info = {"type": type_names[token['type']]}
except:
raise
if token['type'] in constants.tagTokenTypes:
if token['type'] in tagTokenTypes:
info["name"] = token['name']

self.parser.log.append((self.parser.tokenizer.state.__name__,
Expand Down Expand Up @@ -1022,17 +951,9 @@ def __init__(self, parser, tree):
self.endTagHandler.default = self.endTagOther

def isMatchingFormattingElement(self, node1, node2):
if node1.name != node2.name or node1.namespace != node2.namespace:
return False
elif len(node1.attributes) != len(node2.attributes):
return False
else:
attributes1 = sorted(node1.attributes.items())
attributes2 = sorted(node2.attributes.items())
for attr1, attr2 in zip(attributes1, attributes2):
if attr1 != attr2:
return False
return True
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)

# helper
def addFormattingElement(self, token):
Expand Down Expand Up @@ -2798,6 +2719,16 @@ def processEndTag(self, token):
}


def adjust_attributes(token, replacements):
if PY3 or utils.PY27:
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
else:
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())


def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False):
if attributes is None:
Expand Down
41 changes: 40 additions & 1 deletion html5lib/tests/test_parser2.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import absolute_import, division, unicode_literals

from six import PY2, text_type

import io

from . import support # noqa

from html5lib.constants import namespaces
from html5lib import parse
from html5lib import parse, HTMLParser


# tests that aren't autogenerated from text files
Expand Down Expand Up @@ -49,3 +51,40 @@ def test_namespace_html_elements_1_etree():

def test_unicode_file():
assert parse(io.StringIO("a")) is not None


def test_duplicate_attribute():
# This is here because we impl it in parser and not tokenizer
doc = parse('<p class=a class=b>')
el = doc[1][0]
assert el.get("class") == "a"


def test_debug_log():
parser = HTMLParser(debug=True)
parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

if PY2:
for i, log in enumerate(expected):
log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
expected[i] = tuple(log)

assert parser.log == expected
Loading