Skip to content

Search: index from html files for mkdocs projects #7208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,7 @@ def add_features(sender, **kwargs):
DEDUPLICATE_BUILDS = 'deduplicate_builds'
USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'

FEATURES = (
(USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
Expand Down Expand Up @@ -1661,6 +1662,10 @@ def add_features(sender, **kwargs):
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead of relying on other sources'),
),
)

projects = models.ManyToManyField(
Expand Down
111 changes: 110 additions & 1 deletion readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,68 @@ def __init__(self, version):
self.project = self.version.project
self.storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

def _get_page_content(self, page):
    """
    Gets the page content from storage.

    The file is looked up under the html storage path
    of the current version of the project.

    :param page: path of the page, joined to the html storage path.
    :returns: the content of the file,
     or ``None`` if the file couldn't be read.
    """
    content = None
    try:
        storage_path = self.project.get_storage_path(
            type_='html',
            version_slug=self.version.slug,
            include_file=False,
        )
        file_path = self.storage.join(storage_path, page)
        with self.storage.open(file_path, mode='r') as f:
            content = f.read()
    except Exception:
        # Best effort: a page that can't be read is skipped
        # instead of aborting the whole indexing process,
        # but the traceback is kept so the failure can be diagnosed.
        log.warning(
            'Unhandled exception during search processing file: %s',
            page,
            exc_info=True,
        )
    return content

def _get_page_title(self, body, html):
    """
    Gets the title of the html page.

    The text of the first ``h1`` found inside `body` wins.
    If there isn't one, the ``title`` tag of the whole document is used.

    :param body: node where the first ``h1`` is searched.
    :param html: parsed document where the ``title`` tag is searched.
    :returns: the title, or ``None`` if neither tag is present.
    """
    heading = body.css_first('h1')
    if heading:
        # Only the title part of the parsed section title is needed here.
        heading_title, _unused = self._parse_section_title(heading)
        return heading_title

    title_tag = html.css_first('title')
    return self._parse_content(title_tag.text()) if title_tag else None

def _get_main_node(self, html):
    """
    Gets the main node from where to start indexing content.

    The main node is tested in the following order:

    - Try with a tag with the ``main`` role.
      This role is used by several static sites and themes.
    - Try the first ``h1`` node and return its parent.
      Usually all sections are neighbors,
      so they are children of the same parent node.

    .. note::

       These are heuristics over generic HTML:
       the tool that generated the page doesn't tell us
       which node holds the main content,
       so navigation or other unrelated content
       may end up being indexed on some themes.

    :param html: parsed document, its ``body`` is searched.
    :returns: the main node,
     or ``None`` if the document has no body or no candidate node.
    """
    body = html.body
    if body is None:
        # A document without a body has nothing to index.
        return None

    main_node = body.css_first('[role=main]')
    if main_node:
        return main_node

    # TODO: this could be done in smarter way,
    # checking for common parents between all h nodes.
    first_header = body.css_first('h1')
    if first_header:
        return first_header.parent

    return None

def _parse_content(self, content):
"""Removes new line characters and strips all whitespaces."""
content = content.strip().split('\n')
Expand Down Expand Up @@ -404,9 +466,56 @@ def _parse_domain_tag(self, tag):

class MkDocsParser(BaseParser):

"""MkDocs parser, it relies on the json index files."""
"""
MkDocs parser.

Index from the json index file or directly from the html content.
"""

def parse(self, page):
    """
    Parses `page` using the strategy enabled for the project.

    Projects with the ``INDEX_FROM_HTML_FILES`` feature are parsed
    from the html content, all the others from the index file.

    :param page: page to parse, passed as is to the selected parser.
    :returns: whatever the selected parser returns.
    """
    # Imported here to avoid a circular import.
    from readthedocs.projects.models import Feature

    index_from_html = self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
    selected_parser = (
        self.parse_from_html
        if index_from_html
        else self.parse_from_index_file
    )
    return selected_parser(page)

def parse_from_html(self, page):
    """
    Parses `page` directly from its html content.

    :param page: page to read from storage and parse.
    :returns: the result of processing the content.
     If the page has no content or processing it fails,
     a result with an empty title and no sections is returned instead.
    """
    empty_result = {
        'path': page,
        'title': '',
        'sections': [],
        'domain_data': {},
    }
    try:
        raw_html = self._get_page_content(page)
        if not raw_html:
            return empty_result
        return self._process_content(page, raw_html)
    except Exception as error:
        # A page that fails to parse is indexed as empty, not fatal.
        log.info('Failed to index page %s, %s', page, str(error))
    return empty_result

def _process_content(self, page, content):
    """
    Parses the html content of a page into a structured dict.

    :param page: path of the page, used as title when none is found.
    :param content: html content of the page.
    :returns: a dict with the ``path``, ``title``, ``sections``
     and ``domain_data`` keys.
     ``title`` and ``sections`` are empty if no main node was found.
    """
    document = HTMLParser(content)
    main_node = self._get_main_node(document)

    if not main_node:
        log.info(
            "Page doesn't look like it has valid content, skipping. "
            "page=%s",
            page,
        )
        return {
            'path': page,
            'title': "",
            'sections': [],
            'domain_data': {},
        }

    # Fall back to the path of the page when it has no title.
    page_title = self._get_page_title(main_node, document) or page
    return {
        'path': page,
        'title': page_title,
        'sections': list(self._parse_sections(page_title, main_node)),
        'domain_data': {},
    }

def parse_from_index_file(self, page):
storage_path = self.project.get_storage_path(
type_='html',
version_slug=self.version.slug,
Expand Down
121 changes: 121 additions & 0 deletions readthedocs/search/tests/data/mkdocs/in/gitbook/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
<!DOCTYPE html>

<!--
Gitbook theme https://gitlab.com/lramage/mkdocs-gitbook-theme
From https://lramage.gitlab.io/mkdocs-gitbook-theme/
-->

<html lang="en">
<head>
<meta charset="utf-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Mkdocs - GitBook Theme - Mkdocs - GitBook Theme</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge">

<meta name="generator" content="mkdocs-1.1.2, mkdocs-gitbook-1.0.7">

<link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<meta rel="next" href="" />
<link href="./css/style.min.css" rel="stylesheet">
</head>

<body>
<div class="book">
<div class="book-summary">

<nav role="navigation">
<ul class="summary">
<li>
<a href="." target="_blank" class="custom-link">Mkdocs - GitBook Theme</a>
</li>
<li class="divider"></li>
<li class="chapter active" data-path="">
<a href=".">Mkdocs - GitBook Theme</a>
<li class="header">Post</li>

<li>
<a href="post/2015-10-30/" class="">Oldest Post</a>
</li>

<li>
<a href="post/2018-12-31/" class="">Older Post</a>
</li>

<li>
<a href="post/2019-01-02/" class="">Latest Post</a>
</li>

<li class="divider"></li>



<li><a href="http://www.mkdocs.org">
Published with MkDocs
</a></li>

<li><a href="https://github.com/GitbookIO/theme-default">
Theme by GitBook
</a></li>
</ul>

</nav>

</div> <!-- end of book-summary -->

<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">

<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href="." ></a>
</h1>

</div> <!-- end of book-header -->

<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">

<section class="normal markdown-section">

<h1 id="mkdocs-gitbook-theme">Mkdocs - GitBook Theme</h1>
<p><a href="LICENSE"><img alt="Apache 2.0 License" src="https://img.shields.io/badge/license-Apache--2.0-blue.svg?style=flat-square" /></a>
<a href="https://pypi.python.org/pypi/mkdocs-gitbook"><img alt="PyPI" src="https://img.shields.io/pypi/v/mkdocs-gitbook.svg?style=flat-square" /></a></p>
<h2 id="installation">Installation</h2>
<p>First, install the package via PyPI:</p>
<pre><code class="sh">pip install mkdocs-gitbook
</code></pre>

<p>Then include the theme in your <code>mkdocs.yml</code> file:</p>
<pre><code class="yaml">theme:
name: gitbook
</code></pre>

<h2 id="motivation">Motivation</h2>
<p>Gitbook was a static-site generator written in JavaScript.</p>
<p>Mkdocs is a static-site generator written in Python.</p>
<p><strong>Gitbook is <a href="https://docs.gitbook.com/v2-changes/important-differences#cli-toolchain">no longer a static-site generator</a>, <a href="https://docs.gitbook.com/v2-changes/important-differences#git-hosting-and-integration">nor does it use git</a>, nor is it <a href="https://www.gnu.org/philosophy/free-sw.html">free</a> or <a href="https://opensource.org/osd">open source</a>!</strong></p>
<h2 id="screenshot">Screenshot</h2>
<p><a href="https://gitlab.com/lramage/mkdocs-gitbook-theme"><img src="img/screenshot.png" alt="Default theme for GitBook for Mkdocs"></a></p>
<h2 id="license">License</h2>
<p>SPDX-License-Identifier: <a href="https://spdx.org/licenses/Apache-2.0">Apache-2.0</a></p>


</section>

</div> <!-- end of page-inner -->
</div> <!-- end of page-wrapper -->

</div> <!-- end of body-inner -->

</div> <!-- end of book-body -->
<script src="./js/main.js"></script>
<script src="./js/gitbook.min.js"></script>
<script src="./js/theme.min.js"></script>
</body>
</html>
Loading