Skip to content

Document html5lib.treewalkers #386

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 6, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 26 additions & 15 deletions html5lib/treewalkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,25 @@
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support

Args:
treeType (str): the name of the tree type required (case-insensitive).
Supported values are:

- "dom": The xml.dom.minidom DOM implementation
- "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
- "lxml": Optimized walker for lxml.etree
- "genshi": a Genshi stream

Implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the
"etree" tree type only).
:arg str treeType: the name of the tree type required (case-insensitive).
Supported values are:

* "dom": The xml.dom.minidom DOM implementation
* "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with ElementTree,
cElementTree and lxml.etree).
* "lxml": Optimized walker for lxml.etree
* "genshi": a Genshi stream

:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).

:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect

:returns: a TreeWalker class

"""

treeType = treeType.lower()
Expand Down Expand Up @@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens):


def pprint(walker):
"""Pretty printer for tree walkers"""
"""Pretty printer for tree walkers

Takes a TreeWalker instance and pretty prints the output of walking the tree.

:arg walker: a TreeWalker instance

"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
Expand Down
102 changes: 102 additions & 0 deletions html5lib/treewalkers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,110 @@


class TreeWalker(object):
"""Walks a tree yielding tokens

Tokens are dicts that all have a ``type`` field specifying the type of the
token.

"""
def __init__(self, tree):
    """Creates a TreeWalker

    The tree is stored unchanged on the instance as ``self.tree``.

    :arg tree: the tree to walk

    """
    self.tree = tree

def __iter__(self):
    """Iterates over the tokens produced by walking the tree

    The base class provides no implementation; presumably concrete
    walkers are expected to override this method.

    :raises NotImplementedError: always, in this base implementation

    """
    raise NotImplementedError

def error(self, msg):
    """Builds an error token carrying the given message

    :arg msg: the error message, stored in the token's ``data`` field

    :returns: a dict whose ``type`` is ``SerializeError``

    """
    token = {"type": "SerializeError"}
    token["data"] = msg
    return token

def emptyTag(self, namespace, name, attrs, hasChildren=False):
    """Yields an EmptyTag token, optionally followed by an error token

    This is a generator. It always yields one ``EmptyTag`` token and,
    when ``hasChildren`` is true, a second ``SerializeError`` token with
    the message "Void element has children".

    :arg namespace: the namespace of the token--can be ``None``

    :arg name: the name of the element

    :arg attrs: the attributes of the element as a dict

    :arg hasChildren: whether or not to also yield a SerializeError
        token because this tag shouldn't have children

    :returns: generator of one EmptyTag token and at most one
        SerializeError token

    """
    token = {"type": "EmptyTag", "name": name,
             "namespace": namespace,
             "data": attrs}
    yield token
    if not hasChildren:
        return
    yield self.error("Void element has children")

def startTag(self, namespace, name, attrs):
    """Builds a StartTag token for an element

    :arg namespace: the namespace of the token--can be ``None``

    :arg name: the name of the element

    :arg attrs: the attributes of the element as a dict, stored in the
        token's ``data`` field

    :returns: StartTag token

    """
    token = {"type": "StartTag"}
    token["name"] = name
    token["namespace"] = namespace
    token["data"] = attrs
    return token

def endTag(self, namespace, name):
    """Builds an EndTag token for an element

    :arg namespace: the namespace of the token--can be ``None``

    :arg name: the name of the element

    :returns: EndTag token

    """
    token = {"type": "EndTag"}
    token["name"] = name
    token["namespace"] = namespace
    return token

def text(self, data):
"""Generates SpaceCharacters and Characters tokens

Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.

For example:

>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]

:arg data: the text data

:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

"""
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
Expand All @@ -60,18 +136,44 @@ def text(self, data):
yield {"type": "SpaceCharacters", "data": right}

def comment(self, data):
    """Builds a Comment token

    :arg data: the comment, stored in the token's ``data`` field

    :returns: Comment token

    """
    token = {"type": "Comment", "data": data}
    return token

def doctype(self, name, publicId=None, systemId=None):
    """Builds a Doctype token

    :arg name: the doctype name, stored in the token's ``name`` field

    :arg publicId: the public identifier, stored in the token's
        ``publicId`` field; defaults to ``None``

    :arg systemId: the system identifier, stored in the token's
        ``systemId`` field; defaults to ``None``

    :returns: the Doctype token

    """
    token = {"type": "Doctype"}
    token["name"] = name
    token["publicId"] = publicId
    token["systemId"] = systemId
    return token

def entity(self, name):
    """Builds an Entity token

    :arg name: the entity name

    :returns: an Entity token

    """
    token = {"type": "Entity"}
    token["name"] = name
    return token

def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)


Expand Down