-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Search: support section titles inside header tags #9339
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac70213
8e7229e
9fd28a1
5f5793e
08d7369
27b62f5
defc159
fd58784
3b7d3df
43896d1
0303e84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,10 +88,23 @@ def _get_main_node(self, html): | |
# checking for common parents between all h nodes. | ||
first_header = body.css_first("h1") | ||
if first_header: | ||
return first_header.parent | ||
return self._get_header_container(first_header).parent | ||
|
||
return body | ||
|
||
def _get_header_container(self, h_tag): | ||
""" | ||
Get the *real* container of a header tag or title. | ||
|
||
If the parent of the ``h`` tag is a ``header`` tag, | ||
then we return the ``header`` tag, | ||
since the header tag acts as a container for the title of the section. | ||
Otherwise, we return the tag itself. | ||
""" | ||
if h_tag.parent.tag == "header": | ||
return h_tag.parent | ||
return h_tag | ||
|
||
def _parse_content(self, content): | ||
"""Converts all new line characters and multiple spaces to a single space.""" | ||
content = content.strip().split() | ||
|
@@ -110,8 +123,6 @@ def _parse_sections(self, title, body): | |
We can have pages that have content before the first title or that don't have a title, | ||
we index that content first under the title of the original page. | ||
""" | ||
body = self._clean_body(body) | ||
|
||
# Index content for pages that don't start with a title. | ||
# We check for sections till 3 levels to avoid indexing all the content | ||
# in this step. | ||
|
@@ -135,7 +146,8 @@ def _parse_sections(self, title, body): | |
for tag in tags: | ||
try: | ||
title, id = self._parse_section_title(tag) | ||
content, _ = self._parse_section_content(tag.next, depth=2) | ||
next_tag = self._get_header_container(tag).next | ||
content, _ = self._parse_section_content(next_tag, depth=2) | ||
yield { | ||
'id': id, | ||
'title': title, | ||
|
@@ -186,10 +198,10 @@ def _is_section(self, tag): | |
""" | ||
Check if `tag` is a section (linkeable header). | ||
|
||
The tag is a section if it's a ``h`` tag. | ||
The tag is a section if it's a ``h`` or a ``header`` tag. | ||
""" | ||
is_header_tag = re.match(r'h\d$', tag.tag) | ||
return is_header_tag | ||
is_h_tag = re.match(r"h\d$", tag.tag) | ||
return is_h_tag or tag.tag == "header" | ||
|
||
def _parse_section_title(self, tag): | ||
""" | ||
|
@@ -199,15 +211,7 @@ def _parse_section_title(self, tag): | |
|
||
- Get the id from the node itself. | ||
- Get the id from the parent node. | ||
|
||
Additionally: | ||
|
||
- Removes permalink values | ||
""" | ||
nodes_to_be_removed = tag.css('.headerlink') | ||
for node in nodes_to_be_removed: | ||
node.decompose() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we not need this anymore? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This element is already being deleted in the |
||
|
||
section_id = tag.attributes.get('id', '') | ||
if not section_id: | ||
parent = tag.parent | ||
|
@@ -328,6 +332,7 @@ def _process_content(self, page, content): | |
title = "" | ||
sections = [] | ||
if body: | ||
body = self._clean_body(body) | ||
title = self._get_page_title(body, html) or page | ||
sections = self._get_sections(title=title, body=body) | ||
else: | ||
|
@@ -417,7 +422,7 @@ def _process_fjson(self, fjson_path): | |
|
||
if 'body' in data: | ||
try: | ||
body = HTMLParser(data["body"]) | ||
body = self._clean_body(HTMLParser(data["body"])) | ||
sections = self._get_sections(title=title, body=body.body) | ||
except Exception: | ||
log.info('Unable to index sections.', path=fjson_path) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
<!DOCTYPE html> | ||
|
||
<!-- | ||
Default pelican theme. | ||
From https://blog.getpelican.com/pelican-4.7-released.html | ||
--> | ||
|
||
<html lang="en"> | ||
<head> | ||
<meta charset="utf-8"> | ||
<title>Pelican 4.7 released</title> | ||
<link rel="stylesheet" href="https://blog.getpelican.com/theme/css/A.main.css.pagespeed.cf.zFbdR40MwZ.css" type="text/css"/> | ||
<link href="https://blog.getpelican.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate" title="Pelican Development Blog Atom Feed"/> | ||
|
||
<!--[if IE]> | ||
<script src="https://html5shiv.googlecode.com/svn/trunk/html5.js"></script> | ||
<![endif]--> | ||
</head> | ||
|
||
<body id="index" class="home"> | ||
<header id="banner" class="body"> | ||
<h1><a href="https://blog.getpelican.com/">Pelican Development Blog </a></h1> | ||
<nav><ul> | ||
<li class="active"><a href="https://blog.getpelican.com/category/news.html">news</a></li> | ||
<li><a href="https://docs.getpelican.com">documentation</a></li> | ||
<li><a href="https://donate.getpelican.com">contribute</a></li> | ||
<li><a href="/pages/gratitude.html">gratitude</a></li> | ||
</ul></nav> | ||
</header><!-- /#banner --> | ||
<section id="content" class="body"> | ||
<article> | ||
<header> | ||
<h1 class="entry-title"> | ||
<a href="https://blog.getpelican.com/pelican-4.7-released.html" rel="bookmark" title="Permalink to Pelican 4.7 released">Pelican 4.7 released</a></h1> | ||
</header> | ||
|
||
<div class="entry-content"> | ||
<footer class="post-info"> | ||
<abbr class="published" title="2021-10-01T00:00:00+02:00"> | ||
Fri 01 October 2021 | ||
</abbr> | ||
|
||
<address class="vcard author"> | ||
By <a class="url fn" href="https://blog.getpelican.com/author/pelican-contributors.html">Pelican Contributors</a> | ||
</address> | ||
<p>In <a href="https://blog.getpelican.com/category/news.html">news</a>. </p> | ||
|
||
</footer><!-- /.post-info --> <p>Pelican 4.7 is now available. This new release includes the following enhancements, fixes, and tweaks:</p> | ||
<ul class="simple"> | ||
<li>Improve default theme rendering on mobile and other small screen devices <a class="reference external" href="https://github.com/getpelican/pelican/pull/2914">(#2914)</a></li> | ||
</ul> | ||
<p>For more info, please refer to the release page.</p> | ||
<div class="section" id="upgrading-from-previous-releases"> | ||
<h2>Upgrading from previous releases</h2> | ||
<p>Upgrading from Pelican 4.6.x should be smooth and require few (if any) changes to | ||
your environment.</p> | ||
<p>If you run into problems, please see the <a class="reference external" href="https://docs.getpelican.com/en/latest/contribute.html#how-to-get-help">How to Get Help</a> section | ||
of the documentation, and we will update this post with any upgrade tips | ||
contributed by the Pelican community.</p> | ||
</div> | ||
|
||
</div><!-- /.entry-content --> | ||
|
||
</article> | ||
</section> | ||
<section id="extras" class="body"> | ||
<div class="blogroll"> | ||
<h2>links</h2> | ||
<ul> | ||
<li><a href="https://docs.getpelican.com/">Pelican Docs</a></li> | ||
<li><a href="https://donate.getpelican.com/">Support Pelican</a></li> | ||
<li><a href="https://justinmayer.com/">Justin Mayer</a></li> | ||
</ul> | ||
</div><!-- /.blogroll --> | ||
<div class="social"> | ||
<h2>follow</h2> | ||
<ul> | ||
<li><a href="https://blog.getpelican.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate">atom feed</a></li> | ||
|
||
<li><a href="https://twitter.com/getpelican">@getpelican</a></li> | ||
<li><a href="https://twitter.com/jmayer">@jmayer</a></li> | ||
<li><a href="https://github.com/getpelican">github</a></li> | ||
</ul> | ||
</div><!-- /.social --> | ||
</section><!-- /#extras --> | ||
|
||
<footer id="contentinfo" class="body"> | ||
<address id="about" class="vcard body"> | ||
Proudly powered by <a href="http://getpelican.com/">Pelican</a>, which takes great advantage of <a href="http://python.org">Python</a>. | ||
</address><!-- /#about --> | ||
|
||
<p>The theme is by <a href="http://coding.smashingmagazine.com/2009/08/04/designing-a-html-5-layout-from-scratch/">Smashing Magazine</a>, thanks!</p> | ||
</footer><!-- /#contentinfo --> | ||
|
||
<script>(function(f,a,t,h,o,m){a[h]=a[h]||function(){(a[h].q=a[h].q||[]).push(arguments)};o=f.createElement('script'),m=f.getElementsByTagName('script')[0];o.async=1;o.src=t;o.id='fathom-script';m.parentNode.insertBefore(o,m)})(document,window,'https://stats.justinmayer.com/tracker.js','fathom');fathom('set','siteId','EWNWB');fathom('trackPageview');</script> | ||
<script type="text/javascript">var _gaq=_gaq||[];_gaq.push(['_setAccount','UA-295694-7']);_gaq.push(['_trackPageview']);(function(){var ga=document.createElement('script');ga.type='text/javascript';ga.async=true;ga.src=('https:'==document.location.protocol?'https://ssl':'http://www')+'.google-analytics.com/ga.js';var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(ga,s);})();</script> | ||
</body> | ||
</html> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
[ | ||
{ | ||
"path": "index.html", | ||
"title": "Pelican Development Blog", | ||
"sections": [ | ||
{ | ||
"id": "banner", | ||
"title": "Pelican Development Blog", | ||
"content": "" | ||
}, | ||
{ | ||
"id": "", | ||
"title": "Pelican 4.7 released", | ||
"content": "Fri 01 October 2021 By Pelican Contributors In news. Pelican 4.7 is now available. This new release includes the following enhancements, fixes, and tweaks: Improve default theme rendering on mobile and other small screen devices (#2914) For more info, please refer to the release page." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The date here shouldn't be indexed, it's inside a footer tag, maybe we could also omit the footer tag when indexing
https://developer.mozilla.org/en-US/docs/Web/HTML/Element/footer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure the best approach here. I really wish we could comment JSON :( |
||
}, | ||
{ | ||
"id": "upgrading-from-previous-releases", | ||
"title": "Upgrading from previous releases", | ||
"content": "Upgrading from Pelican 4.6.x should be smooth and require few (if any) changes to your environment. If you run into problems, please see the How to Get Help section of the documentation, and we will update this post with any upgrade tips contributed by the Pelican community." | ||
}, | ||
{ | ||
"id": "", | ||
"title": "links", | ||
"content": "Pelican Docs Support Pelican Justin Mayer" | ||
}, | ||
{ | ||
"id": "", | ||
"title": "follow", | ||
"content": "atom feed @getpelican @jmayer github" | ||
} | ||
Comment on lines
+22
to
+30
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two are valid sections, but shouldn't be indexed. They are outside the pelican's main node ( |
||
], | ||
"domain_data": {} | ||
} | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💯 examples.