diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index e2328847..d44447ea 100644 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -1,29 +1,32 @@ -"""A collection of modules for building different kinds of tree from -HTML documents. +"""A collection of modules for building different kinds of trees from HTML +documents. To create a treebuilder for a new type of tree, you need to do implement several things: -1) A set of classes for various types of elements: Document, Doctype, -Comment, Element. These must implement the interface of -_base.treebuilders.Node (although comment nodes have a different -signature for their constructor, see treebuilders.etree.Comment) -Textual content may also be implemented as another node type, or not, as -your tree implementation requires. - -2) A treebuilder object (called TreeBuilder by convention) that -inherits from treebuilders._base.TreeBuilder. This has 4 required attributes: -documentClass - the class to use for the bottommost node of a document -elementClass - the class to use for HTML Elements -commentClass - the class to use for comments -doctypeClass - the class to use for doctypes -It also has one required method: -getDocument - Returns the root node of the complete document tree - -3) If you wish to run the unit tests, you must also create a -testSerializer method on your treebuilder which accepts a node and -returns a string containing Node and its children serialized according -to the format used in the unittests +1. A set of classes for various types of elements: Document, Doctype, Comment, + Element. These must implement the interface of ``treebuilders.base.Node`` + (although comment nodes have a different signature for their constructor, + see ``treebuilders.etree.Comment``). Textual content may also be implemented + as another node type, or not, as your tree implementation requires. + +2. 
A treebuilder object (called ``TreeBuilder`` by convention) that inherits + from ``treebuilders.base.TreeBuilder``. This has 4 required attributes: + + * ``documentClass`` - the class to use for the bottommost node of a document + * ``elementClass`` - the class to use for HTML Elements + * ``commentClass`` - the class to use for comments + * ``doctypeClass`` - the class to use for doctypes + + It also has one required method: + + * ``getDocument`` - Returns the root node of the complete document tree + +3. If you wish to run the unit tests, you must also create a ``testSerializer`` + method on your treebuilder which accepts a node and returns a string + containing Node and its children serialized according to the format used in + the unittests + """ from __future__ import absolute_import, division, unicode_literals @@ -34,23 +37,32 @@ def getTreeBuilder(treeType, implementation=None, **kwargs): - """Get a TreeBuilder class for various types of tree with built-in support - - treeType - the name of the tree type required (case-insensitive). Supported - values are: - - "dom" - A generic builder for DOM implementations, defaulting to - a xml.dom.minidom based implementation. - "etree" - A generic builder for tree implementations exposing an - ElementTree-like interface, defaulting to - xml.etree.cElementTree if available and - xml.etree.ElementTree if not. - "lxml" - A etree-based builder for lxml.etree, handling - limitations of lxml's implementation. - - implementation - (Currently applies to the "etree" and "dom" tree types). A - module implementing the tree type e.g. - xml.etree.ElementTree or xml.etree.cElementTree.""" + """Get a TreeBuilder class for various types of trees with built-in support + + :arg treeType: the name of the tree type required (case-insensitive). Supported + values are: + + * "dom" - A generic builder for DOM implementations, defaulting to a + xml.dom.minidom based implementation. 
+ * "etree" - A generic builder for tree implementations exposing an + ElementTree-like interface, defaulting to xml.etree.cElementTree if + available and xml.etree.ElementTree if not. + * "lxml" - An etree-based builder for lxml.etree, handling limitations + of lxml's implementation. + + :arg implementation: (Currently applies to the "etree" and "dom" tree + types). A module implementing the tree type e.g. xml.etree.ElementTree + or xml.etree.cElementTree. + + :arg kwargs: Any additional options to pass to the TreeBuilder when + creating it. + + Example: + + >>> from html5lib.treebuilders import getTreeBuilder + >>> builder = getTreeBuilder('etree') + + """ treeType = treeType.lower() if treeType not in treeBuilderCache: diff --git a/html5lib/treebuilders/base.py b/html5lib/treebuilders/base.py index a4b2792a..05d97ecc 100644 --- a/html5lib/treebuilders/base.py +++ b/html5lib/treebuilders/base.py @@ -21,22 +21,25 @@ class Node(object): + """Represents an item in the tree""" def __init__(self, name): - """Node representing an item in the tree. - name - The tag name associated with the node - parent - The parent of the current node (or None for the document node) - value - The value of the current node (applies to text nodes and - comments - attributes - a dict holding name, value pairs for attributes of the node - childNodes - a list of child nodes of the current node. This must - include all elements but not necessarily other node types - _flags - A list of miscellaneous flags that can be set on the node + """Creates a Node + + :arg name: The tag name associated with the node + """ + # The tag name associated with the node self.name = name + # The parent of the current node (or None for the document node) self.parent = None + # The value of the current node (applies to text nodes and comments) self.value = None + # A dict holding name -> value pairs for attributes of the node self.attributes = {} + # A list of child nodes of the current node. 
This must include all + # elements but not necessarily other node types. self.childNodes = [] + # A list of miscellaneous flags that can be set on the node. self._flags = [] def __str__(self): @@ -53,23 +56,41 @@ def __repr__(self): def appendChild(self, node): """Insert node as a child of the current node + + :arg node: the node to insert + """ raise NotImplementedError def insertText(self, data, insertBefore=None): """Insert data as text in the current node, positioned before the start of node insertBefore or to the end of the node's text. + + :arg data: the data to insert + + :arg insertBefore: the node to insert the text before; if None, the + text is appended to the end of the current node's text + """ raise NotImplementedError def insertBefore(self, node, refNode): """Insert node as a child of the current node, before refNode in the list of child nodes. Raises ValueError if refNode is not a child of - the current node""" + the current node + + :arg node: the node to insert + + :arg refNode: the child node to insert the node before + + """ raise NotImplementedError def removeChild(self, node): """Remove node from the children of the current node + + :arg node: the child node to remove + """ raise NotImplementedError @@ -77,6 +98,9 @@ def reparentChildren(self, newParent): """Move all the children of the current node to newParent. This is needed so that trees that don't store text as nodes move the text in the correct way + + :arg newParent: the node to move all this node's children to + """ # XXX - should this method be made more general? 
for child in self.childNodes: @@ -121,10 +145,12 @@ def nodesEqual(self, node1, node2): class TreeBuilder(object): """Base treebuilder implementation - documentClass - the class to use for the bottommost node of a document - elementClass - the class to use for HTML Elements - commentClass - the class to use for comments - doctypeClass - the class to use for doctypes + + * documentClass - the class to use for the bottommost node of a document + * elementClass - the class to use for HTML Elements + * commentClass - the class to use for comments + * doctypeClass - the class to use for doctypes + """ # pylint:disable=not-callable @@ -144,6 +170,11 @@ class TreeBuilder(object): fragmentClass = None def __init__(self, namespaceHTMLElements): + """Create a TreeBuilder + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + """ if namespaceHTMLElements: self.defaultNamespace = "http://www.w3.org/1999/xhtml" else: @@ -367,11 +398,11 @@ def generateImpliedEndTags(self, exclude=None): self.generateImpliedEndTags(exclude) def getDocument(self): - "Return the final tree" + """Return the final tree""" return self.document def getFragment(self): - "Return the final fragment" + """Return the final fragment""" # assert self.innerHTML fragment = self.fragmentClass() self.openElements[0].reparentChildren(fragment) @@ -379,5 +410,8 @@ def getFragment(self): def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests - node - the node from which to start serializing""" + + :arg node: the node from which to start serializing + + """ raise NotImplementedError diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 908820c0..ca12a99c 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -309,7 +309,6 @@ def insertCommentMain(self, data, parent=None): super(TreeBuilder, self).insertComment(data, parent) def insertRoot(self, token): - """Create the 
document root""" # Because of the way libxml2 works, it doesn't seem to be possible to # alter information like the doctype after the tree has been parsed. # Therefore we need to use the built-in parser to create our initial