From 49b36e8cb62484273cb4cf5a813c94442e6662f3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 4 Dec 2017 12:10:13 -0500 Subject: [PATCH] Document html5lib.treewalkers --- html5lib/treewalkers/__init__.py | 41 ++++++++----- html5lib/treewalkers/base.py | 102 +++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 15 deletions(-) diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py index 402b722e..9bec2076 100644 --- a/html5lib/treewalkers/__init__.py +++ b/html5lib/treewalkers/__init__.py @@ -21,20 +21,25 @@ def getTreeWalker(treeType, implementation=None, **kwargs): """Get a TreeWalker class for various types of tree with built-in support - Args: - treeType (str): the name of the tree type required (case-insensitive). - Supported values are: - - - "dom": The xml.dom.minidom DOM implementation - - "etree": A generic walker for tree implementations exposing an - elementtree-like interface (known to work with - ElementTree, cElementTree and lxml.etree). - - "lxml": Optimized walker for lxml.etree - - "genshi": a Genshi stream - - Implementation: A module implementing the tree type e.g. - xml.etree.ElementTree or cElementTree (Currently applies to the - "etree" tree type only). + :arg str treeType: the name of the tree type required (case-insensitive). + Supported values are: + + * "dom": The xml.dom.minidom DOM implementation + * "etree": A generic walker for tree implementations exposing an + elementtree-like interface (known to work with ElementTree, + cElementTree and lxml.etree). + * "lxml": Optimized walker for lxml.etree + * "genshi": a Genshi stream + + :arg implementation: A module implementing the tree type e.g. + xml.etree.ElementTree or cElementTree (Currently applies to the "etree" + tree type only). 
+ + :arg kwargs: keyword arguments passed to the etree walker--for other + walkers, this has no effect + + :returns: a TreeWalker class + """ treeType = treeType.lower() @@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens): def pprint(walker): - """Pretty printer for tree walkers""" + """Pretty printer for tree walkers + + Takes a TreeWalker instance and pretty prints the output of walking the tree. + + :arg walker: a TreeWalker instance + + """ output = [] indent = 0 for token in concatenateCharacterTokens(walker): diff --git a/html5lib/treewalkers/base.py b/html5lib/treewalkers/base.py index 36e1ba24..80c474c4 100644 --- a/html5lib/treewalkers/base.py +++ b/html5lib/treewalkers/base.py @@ -18,16 +18,48 @@ class TreeWalker(object): + """Walks a tree yielding tokens + + Tokens are dicts that all have a ``type`` field specifying the type of the + token. + + """ def __init__(self, tree): + """Creates a TreeWalker + + :arg tree: the tree to walk + + """ self.tree = tree def __iter__(self): raise NotImplementedError def error(self, msg): + """Generates an error token with the given message + + :arg msg: the error message + + :returns: SerializeError token + + """ return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): + """Generates an EmptyTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :arg attrs: the attributes of the element as a dict + + :arg hasChildren: whether or not to yield a SerializeError token because + this tag shouldn't have children + + :returns: EmptyTag token + + """ yield {"type": "EmptyTag", "name": name, "namespace": namespace, "data": attrs} @@ -35,17 +67,61 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False): yield self.error("Void element has children") def startTag(self, namespace, name, attrs): + """Generates a StartTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the
name of the element + + :arg attrs: the attributes of the element as a dict + + :returns: StartTag token + + """ return {"type": "StartTag", "name": name, "namespace": namespace, "data": attrs} def endTag(self, namespace, name): + """Generates an EndTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :returns: EndTag token + + """ return {"type": "EndTag", "name": name, "namespace": namespace} def text(self, data): + """Generates SpaceCharacters and Characters tokens + + Depending on what's in the data, this generates zero or more + ``SpaceCharacters`` and ``Characters`` tokens. + + For example: + + >>> from html5lib.treewalkers.base import TreeWalker + >>> # Give it an empty tree just so it instantiates + >>> walker = TreeWalker([]) + >>> list(walker.text('')) + [] + >>> list(walker.text(' ')) + [{u'data': ' ', u'type': u'SpaceCharacters'}] + >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE + [{u'data': ' ', u'type': u'SpaceCharacters'}, + {u'data': u'abc', u'type': u'Characters'}, + {u'data': u' ', u'type': u'SpaceCharacters'}] + + :arg data: the text data + + :returns: zero or more ``SpaceCharacters`` and ``Characters`` tokens + + """ data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] @@ -60,18 +136,44 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): + """Generates a Comment token + + :arg data: the comment + + :returns: Comment token + + """ return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None): + """Generates a Doctype token + + :arg name: the name of the doctype + + :arg publicId: the public identifier of the doctype--defaults to ``None`` + + :arg systemId: the system identifier of the doctype--defaults to ``None`` + + :returns: the Doctype token + + """ return {"type": "Doctype", "name": name, "publicId": publicId, "systemId": systemId} def entity(self, name): + """Generates an Entity token + + :arg name: the entity name + + :returns: an Entity token + + """ return {"type": "Entity", "name":
name} def unknown(self, nodeType): + """Handles unknown node types""" return self.error("Unknown node type: " + nodeType)