@@ -88,10 +88,23 @@ def _get_main_node(self, html):
88
88
# checking for common parents between all h nodes.
89
89
first_header = body .css_first ("h1" )
90
90
if first_header :
91
- return first_header .parent
91
+ return self . _get_header_container ( first_header ) .parent
92
92
93
93
return body
94
94
95
def _get_header_container(self, h_tag):
    """
    Get the *real* container of a header tag or title.

    If the parent of the ``h`` tag is a ``header`` tag,
    then we return the ``header`` tag,
    since the header tag acts as a container for the title of the section.
    Otherwise, we return the tag itself.

    :param h_tag: Node of the ``h`` tag. Only its ``parent`` attribute
        (and that parent's ``tag`` attribute) are read.
    :returns: ``h_tag.parent`` when that parent is a ``header`` tag,
        otherwise ``h_tag`` unchanged.
    """
    parent = h_tag.parent
    # A node without a parent cannot be wrapped in a ``header`` tag,
    # so fall back to the tag itself instead of raising AttributeError.
    if parent is not None and parent.tag == "header":
        return parent
    return h_tag
107
+
95
108
def _parse_content (self , content ):
96
109
"""Converts all new line characters and multiple spaces to a single space."""
97
110
content = content .strip ().split ()
@@ -110,8 +123,6 @@ def _parse_sections(self, title, body):
110
123
We can have pages that have content before the first title or that don't have a title,
111
124
we index that content first under the title of the original page.
112
125
"""
113
- body = self ._clean_body (body )
114
-
115
126
# Index content for pages that don't start with a title.
116
127
# We check for sections till 3 levels to avoid indexing all the content
117
128
# in this step.
@@ -135,7 +146,8 @@ def _parse_sections(self, title, body):
135
146
for tag in tags :
136
147
try :
137
148
title , id = self ._parse_section_title (tag )
138
- content , _ = self ._parse_section_content (tag .next , depth = 2 )
149
+ next_tag = self ._get_header_container (tag ).next
150
+ content , _ = self ._parse_section_content (next_tag , depth = 2 )
139
151
yield {
140
152
'id' : id ,
141
153
'title' : title ,
@@ -186,10 +198,10 @@ def _is_section(self, tag):
186
198
"""
187
199
Check if `tag` is a section (linkable header).
188
200
189
- The tag is a section if it's a ``h`` tag.
201
+ The tag is a section if it's a ``h`` or a ``header`` tag.
190
202
"""
191
- is_header_tag = re .match (r' h\d$' , tag .tag )
192
- return is_header_tag
203
+ is_h_tag = re .match (r" h\d$" , tag .tag )
204
+ return is_h_tag or tag . tag == "header"
193
205
194
206
def _parse_section_title (self , tag ):
195
207
"""
@@ -199,15 +211,7 @@ def _parse_section_title(self, tag):
199
211
200
212
- Get the id from the node itself.
201
213
- Get the id from the parent node.
202
-
203
- Additionally:
204
-
205
- - Removes permalink values
206
214
"""
207
- nodes_to_be_removed = tag .css ('.headerlink' )
208
- for node in nodes_to_be_removed :
209
- node .decompose ()
210
-
211
215
section_id = tag .attributes .get ('id' , '' )
212
216
if not section_id :
213
217
parent = tag .parent
@@ -328,6 +332,7 @@ def _process_content(self, page, content):
328
332
title = ""
329
333
sections = []
330
334
if body :
335
+ body = self ._clean_body (body )
331
336
title = self ._get_page_title (body , html ) or page
332
337
sections = self ._get_sections (title = title , body = body )
333
338
else :
@@ -417,7 +422,7 @@ def _process_fjson(self, fjson_path):
417
422
418
423
if 'body' in data :
419
424
try :
420
- body = HTMLParser (data ["body" ])
425
+ body = self . _clean_body ( HTMLParser (data ["body" ]) )
421
426
sections = self ._get_sections (title = title , body = body .body )
422
427
except Exception :
423
428
log .info ('Unable to index sections.' , path = fjson_path )
0 commit comments