Skip to content

Rewrite and Refractor #20

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
sphinx
sphinx==3.1.2
wheel==0.34.2
pytest==5.4.3
beautifulsoup4==4.9.1
setuptools==47.3.1
setuptools==49.2.0
docutils==0.16
File renamed without changes.
23 changes: 23 additions & 0 deletions sphinxext/opengraph/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Any, Dict
from sphinx.application import Sphinx
from .event_handler import html_page_context

# todo: rtd option to grab READTHEDOCS_LANGUAGE and READTHEDOCS_VERSION for link
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can put these TODOs in a separate PR after this is merged.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, those are going to be gone before I remove it from being a draft, just waiting for an opinion on the lists and sections to close that part off

# todo: grab first image on a page


def setup(app: Sphinx) -> Dict[str, Any]:
app.add_config_value("ogp_site_url", None, "html")
app.add_config_value("ogp_description_length", 200, "html")
app.add_config_value("ogp_image", None, "html")
app.add_config_value("ogp_image_alt", True, "html")
app.add_config_value("ogp_type", "website", "html")
app.add_config_value("ogp_site_name", None, "html")
app.add_config_value("ogp_custom_meta_tags", [], "html")

app.connect("html-page-context", html_page_context)

return {
"parallel_read_safe": True,
"parallel_write_safe": True,
}
10 changes: 10 additions & 0 deletions sphinxext/opengraph/directive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List
from sphinx.util.docutils import SphinxDirective
from docutils.nodes import Node


class OpenGraphDirective(SphinxDirective):
def run(self) -> List[Node]:
pass

pass
46 changes: 46 additions & 0 deletions sphinxext/opengraph/event_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Dict, Any
from sphinx.application import Sphinx
from docutils import nodes
from urllib.parse import urljoin
from .util import sanitize_title, make_tag, add_tag_from_config
from .visitor import OpenGraphVisitor


def html_page_context(
app: Sphinx,
pagename: str,
templatename: str,
context: Dict[str, Any],
doctree: nodes.document,
) -> None:
if not doctree:
return
# Set length of description
try:
desc_len = int(app.config["ogp_description_length"])
except ValueError:
desc_len = 200
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Constants shouldn't be defined twice.
Since app.add_config_value("ogp_description_length", 200, "html") is already in __init__.py, this isn't necessary.


# grab the description from the page
visitor = OpenGraphVisitor(doctree, desc_len)
doctree.walkabout(visitor)

# parse out html from the page title
page_title = sanitize_title(context["title"])

# get the page's url
page_url = urljoin(
app.config["ogp_site_url"], context["pagename"] + context["file_suffix"]
)

# add all the tags which need to be added
context["metatags"] += make_tag("title", page_title)
context["metatags"] += make_tag("type", app.config["ogp_type"])
context["metatags"] += make_tag("url", page_url)
add_tag_from_config(context, app.config, "site_name")
context["metatags"] += make_tag("description", visitor.description)
add_tag_from_config(context, app.config, "image")
add_tag_from_config(context, app.config, "image:alt")
Comment on lines +40 to +43
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the mixed usage of context["metatags"] += and add_tag_from_config. I think add_tag_from_config reduces readability and should be removed.


# custom tags
context["metatags"] += "\n".join(app.config["ogp_custom_meta_tags"])
15 changes: 15 additions & 0 deletions sphinxext/opengraph/html_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from html.parser import HTMLParser


class HTMLTextParser(HTMLParser):
"""
Parse HTML into text
"""

def __init__(self):
super().__init__()
# Text from html tags
self.text = ""

def handle_data(self, data) -> None:
self.text += data
29 changes: 29 additions & 0 deletions sphinxext/opengraph/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import Dict, Any
from sphinx.config import Config
from .html_parser import HTMLTextParser


def make_tag(tag: str, content: str) -> str:
return f'<meta property="og:{tag}" content="{content}" />\n '


def add_tag_from_config(
context: Dict[str, Any], config: Config, tag: str, conf_name: str = ""
):
if not conf_name:
conf_name = tag.replace(":", "_")

conf_name = f"ogp_{conf_name}"

value = config[conf_name]
if value:
context["metatags"] += make_tag(tag, value)


def sanitize_title(title: str) -> str:
# take in a title and return a clean version.
html_parser = HTMLTextParser()
html_parser.feed(title)
html_parser.close()

return html_parser.text
63 changes: 63 additions & 0 deletions sphinxext/opengraph/visitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from docutils import nodes


class OpenGraphVisitor(nodes.GenericNodeVisitor):
def __init__(
self, document: nodes.document, desc_len: int, title_count: int = 1
) -> None:
super().__init__(document)
self.description = ""
self.desc_len = desc_len
self.title_count = title_count
self.first_list = "" # todo: rename

def default_visit(self, node: nodes.Element) -> None:
if len(self.description) >= self.desc_len:
# Stop the traversal if the description is long enough
# raise nodes.StopTraversal
pass

if isinstance(node, (nodes.Invisible, nodes.Admonition)):
# Skip all comments and admonitions
raise nodes.SkipNode

if isinstance(node, nodes.title):
# title count refers to the amount of titles remaining
self.title_count -= 1
if self.title_count < 0:
Comment on lines +26 to +27
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

counts should count upwards.

Also, what if the first section does not have text?
Example: https://docs.wpilib.org/en/latest/docs/software/driverstation/manually-setting-the-driver-station-to-start-custom-dashboard.html

If we start counting sections, we should either:

  1. Start counting at the first section containing text
  2. Only count sections containing text.

Copy link
Collaborator Author

@ItayZiv ItayZiv Jul 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forgot to ask about this part in the discord, but this is another part I wasn't really sure about.
Right now the idea is the user can enter how many sections to search, but another possibility would be to only grab the first section that isn't empty.
I agree that regardless empty sections should be ignored.
Not sure if we should put in the titles of empty sections or ignore all titles as it is now.
Edit: downwards counting was to reduce variable count if it is something the user inputs, probably will have to change to make the changes above.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding titles, the options I see are either:

  1. Only include titles of sections with text
  2. Ignore all titles.

Either is fine really.

Regarding counting sections:
Counting sections falls counter to description length. If the user specifies 300 characters, then it may not be intuitive as to why a lot less characters were included.

I'm fine with providing options for both description_length and num_sections but it would need to be clear in the readme that the amount of text grabbed is pseudo_min(description_length, num_sections).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the counting sections, my train of thought is that ogp:description is meant to be a small short description of the page, just grabbing the first X chars means it will always just be everything that can fit, while grabbing the first paragraph will most likely be some kind of either description of the page, or snippet of the contents of the page.
I wouldn't be opposed to having it only grab the first section with text and renaming the desc_len to max_desc_len.
The other option would be to have that configurable, where it can grab the first 300 chars and ignore all titles, or do as mentioned above.
We could get into more complex title impls, but I feel like the thing I said above should be the default and be the best in most use cases.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grabbing just one section isn't enough. The previous impl by a Sphinx Maintainer was along the lines of pseudomin(3 sections, 200 char). I agree with your argument for counting sections. I just think that both should be options.

# raise nodes.StopTraversal
pass

if not self.first_list and isinstance(
node, (nodes.bullet_list, nodes.enumerated_list)
):
# get the first list on a page to be used instead of description
self.first_list = node.astext().replace("\n\n", ", ")

if isinstance(node, nodes.paragraph):
# Add only paragraphs to the description

if self.description:
# add a space between paragraphs in case one does not exist
# if there is a double space it will be removed automatically
self.description += " "

self.description += node.astext()

def default_departure(self, node: nodes.Element) -> None:
# remove all double spaces
self.description = self.description.replace(" ", " ")

if len(self.description) > self.desc_len:
self.description = self.description[: self.desc_len]

if self.desc_len > 3:
self.description = self.description[:-3] + "..."

# runs only when this is the last node
if node.parent is None:
# switch to use the list if the description is only the same list
if self.description == self.first_list.replace(
",", ""
):
self.description = self.first_list