Skip to content

html_to_vdom transform to remove html/body but preserve head content #832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Dec 1, 2022
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
975f5f2
Insert head content into html_to_vdom
Archmonger Nov 9, 2022
77270a2
more efficient node appending
Archmonger Nov 9, 2022
a815efc
add changelog
Archmonger Nov 9, 2022
745a537
nodes -> body node
Archmonger Nov 9, 2022
ac7d948
parsed_document -> html_node
Archmonger Nov 9, 2022
025a42d
remove useless has_root_node variable
Archmonger Nov 9, 2022
a4a076f
fix comment
Archmonger Nov 9, 2022
a70ddfa
del_html_body_transform
Archmonger Nov 13, 2022
bc890a4
fix type hint errors
Archmonger Nov 13, 2022
763ef8e
type hint fixes 2
Archmonger Nov 13, 2022
0e8cbf7
remove unusable head and body tags
Archmonger Nov 13, 2022
45b3c2e
uno mas
Archmonger Nov 13, 2022
9e740e8
import future annotations
Archmonger Nov 13, 2022
24a788d
fix docs warnings
Archmonger Nov 13, 2022
ab3cb04
clean up last warning
Archmonger Nov 13, 2022
ce2c972
fix docstrings
Archmonger Nov 13, 2022
e5aba03
del_html_body_transform docstrings
Archmonger Nov 13, 2022
a348a98
re-add head API
Archmonger Nov 14, 2022
a0407d0
fix changelog PR links
Archmonger Nov 14, 2022
16285ff
docstring cleanup
Archmonger Nov 14, 2022
d78afdb
Better type hint
rmorshea Nov 19, 2022
18e781b
Revert "Better type hint"
Archmonger Nov 23, 2022
b290afa
Merge remote-tracking branch 'upstream/main' into html_to_vdom_head_c…
Archmonger Nov 23, 2022
2a9d2e2
more concise transform implementation
Archmonger Nov 30, 2022
5b08771
Merge remote-tracking branch 'upstream/main' into html_to_vdom_head_c…
Archmonger Nov 30, 2022
dcec026
fix merge error
Archmonger Nov 30, 2022
bdd5f16
merge changelog better
Archmonger Nov 30, 2022
e4267a5
annotate as VdomDict
rmorshea Dec 1, 2022
54423f3
Merge branch 'main' into html_to_vdom_head_content
rmorshea Dec 1, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/_custom_js/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion docs/source/about/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@ more info, see the :ref:`Contributor Guide <Creating a Changelog Entry>`.
Unreleased
----------

No changes.
**Added**

- :pull:`832` - ``del_html_body_transform`` to remove ``<html>``, ``<head>``, and ``<body>`` while preserving ``<head>`` and ``<body>`` children.

**Fixed**

- :pull:`832` - Fix ``html_to_vdom`` improperly handling ``<html>``, ``<head>``, and ``<body>``.

**Removed**

- :pull:`832` - Removed ``idom.html.body`` as it is currently unusable due to technological limitations, and thus not needed.


v0.41.0
Expand Down
2 changes: 0 additions & 2 deletions src/idom/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

**Content sectioning**

- :func:`body`
- :func:`address`
- :func:`article`
- :func:`aside`
Expand Down Expand Up @@ -189,7 +188,6 @@ def _(*children: Any, key: Key | None = None) -> VdomDict:
title = make_vdom_constructor("title")

# Content sectioning
body = make_vdom_constructor("body")
address = make_vdom_constructor("address")
article = make_vdom_constructor("article")
aside = make_vdom_constructor("aside")
Expand Down
84 changes: 53 additions & 31 deletions src/idom/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

from itertools import chain
from typing import Any, Callable, Generic, Iterable, List, TypeVar, Union

from lxml import etree
from lxml.html import fragments_fromstring
from lxml.html import fromstring

import idom
from idom.core.types import VdomDict
Expand Down Expand Up @@ -63,7 +65,7 @@ def html_to_vdom(
using a ``key=...`` attribute within your HTML tag.

Parameters:
source:
html:
The raw HTML as a string
transforms:
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
Expand All @@ -77,15 +79,15 @@ def html_to_vdom(
raise TypeError(f"Expected html to be a string, not {type(html).__name__}")

# If the user provided a string, convert it to a list of lxml.etree nodes
parser = etree.HTMLParser(
remove_comments=True,
remove_pis=True,
remove_blank_text=True,
recover=not strict,
)
try:
nodes: list[etree._Element] = fragments_fromstring(
html, no_leading_text=True, parser=parser
root_node: etree._Element = fromstring(
html.strip(),
parser=etree.HTMLParser(
remove_comments=True,
remove_pis=True,
remove_blank_text=True,
recover=not strict,
),
)
except etree.XMLSyntaxError as e:
if not strict:
Expand All @@ -97,34 +99,17 @@ def html_to_vdom(
"you can disable the strict parameter on html_to_vdom().\n"
"Otherwise, repair your broken HTML and try again."
) from e
has_root_node = len(nodes) == 1

# Find or create a root node
if has_root_node:
root_node = nodes[0]
else:
# etree.Element requires a non-empty tag - we correct this below
root_node = etree.Element("TEMP", None, None)
for child in nodes:
root_node.append(child)

# Convert the lxml node to a VDOM dict
vdom = _etree_to_vdom(root_node, transforms)

# Change the artificially created root node to a React Fragment, instead of a div
if not has_root_node:
vdom["tagName"] = ""

return vdom
return _etree_to_vdom(root_node, transforms)


def _etree_to_vdom(
node: etree._Element, transforms: Iterable[_ModelTransform]
) -> VdomDict:
"""Recusively transform an lxml etree node into a DOM model
"""Transform an lxml etree node into a DOM model

Parameters:
source:
node:
The ``lxml.etree._Element`` node
transforms:
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
Expand All @@ -136,7 +121,7 @@ def _etree_to_vdom(
f"Expected node to be a etree._Element, not {type(node).__name__}"
)

# This will recursively call _etree_to_vdom() on all children
# Recursively call _etree_to_vdom() on all children
children = _generate_vdom_children(node, transforms)

# Convert the lxml node to a VDOM dict
Expand Down Expand Up @@ -223,3 +208,40 @@ def _hypen_to_camel_case(string: str) -> str:

class HTMLParseError(etree.LxmlSyntaxError): # type: ignore[misc]
    """Raised when an HTML document cannot be parsed using strict parsing.

    Subclasses ``etree.LxmlSyntaxError``, so ``except`` clauses written for
    that lxml exception also catch this error.
    """


def del_html_body_transform(vdom: dict[str, Any]) -> dict[str, Any]:
    """Transform intended for use with `html_to_vdom`.

    Removes `<html>`, `<head>`, and `<body>` while preserving `<head>` and
    `<body>` children. Nodes whose ``tagName`` is not ``"html"`` are returned
    untouched.

    The input dictionary is never mutated; a new dictionary is built for the
    replaced `<html>` node.

    Parameters:
        vdom:
            The VDOM dictionary to transform.

    Returns:
        * ``vdom`` itself when it is not an `<html>` node.
        * The only remaining child when exactly one child is left after
          flattening and that child is itself a VDOM dictionary.
        * Otherwise a node with an empty ``tagName`` holding the flattened
          children. All other fields of `<html>` (e.g. attributes) are dropped.
    """
    if vdom.get("tagName") != "html":
        return vdom

    # Replace every `<head>` / `<body>` wrapper with its own children, in
    # document order. Anything else (including plain-text children, which are
    # strings rather than dictionaries) is kept in place instead of being
    # dropped or crashing the lookup.
    flattened: list[Any] = []
    for child in vdom.get("children", []):
        if isinstance(child, dict) and child.get("tagName") in ("head", "body"):
            flattened.extend(child.get("children", []))
        else:
            flattened.append(child)

    # A lone element does not need a wrapper. A lone text child is *not*
    # unwrapped, because the transform must keep returning a dictionary.
    if len(flattened) == 1 and isinstance(flattened[0], dict):
        only_child: dict[str, Any] = flattened[0]
        return only_child

    fragment: dict[str, Any] = {"tagName": ""}
    # Only emit a "children" key when there is something to hold or the
    # original node already carried one, so empty nodes keep their shape.
    if flattened or "children" in vdom:
        fragment["children"] = flattened
    return fragment
44 changes: 42 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

import idom
from idom.utils import HTMLParseError, html_to_vdom
from idom.utils import HTMLParseError, del_html_body_transform, html_to_vdom


def test_basic_ref_behavior():
Expand Down Expand Up @@ -141,11 +141,51 @@ def test_html_to_vdom_with_no_parent_node():
source = "<p>Hello</p><div>World</div>"

expected = {
"tagName": "",
"tagName": "div",
"children": [
{"tagName": "p", "children": ["Hello"]},
{"tagName": "div", "children": ["World"]},
],
}

assert html_to_vdom(source) == expected


def test_del_html_body_transform():
    """A full document collapses to a fragment of its head and body children."""
    document = """
<!DOCTYPE html>
<html lang="en">

<head>
<title>My Title</title>
</head>

<body><h1>Hello World</h1></body>

</html>
"""

    # The `<title>` from `<head>` comes first, followed by the `<body>` content.
    title_node = {"tagName": "title", "children": ["My Title"]}
    heading_node = {"tagName": "h1", "children": ["Hello World"]}

    result = html_to_vdom(document, del_html_body_transform)

    assert result == {"tagName": "", "children": [title_node, heading_node]}


def test_del_html_body_transform_no_head():
    """Without a `<head>`, a single body element is returned on its own."""
    document = """
<!DOCTYPE html>
<html lang="en">

<body><h1>Hello World</h1></body>

</html>
"""

    result = html_to_vdom(document, del_html_body_transform)

    # Only one node survives the transform, so no wrapper node is expected.
    assert result == {"tagName": "h1", "children": ["Hello World"]}