14
14
log = structlog .get_logger (__name__ )
15
15
16
16
17
- class BaseParser :
17
+ class GenericParser :
18
18
19
19
# Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
20
20
max_inner_documents = 10000
@@ -73,6 +73,7 @@ def _get_main_node(self, html):
73
73
- Try the first ``h1`` node and return its parent
74
74
Usually all sections are neighbors,
75
75
so they are children of the same parent node.
76
+ - Return the body element itself if all checks above fail.
76
77
"""
77
78
body = html .body
78
79
main_node = body .css_first ('[role=main]' )
@@ -85,11 +86,11 @@ def _get_main_node(self, html):
85
86
86
87
# TODO: this could be done in smarter way,
87
88
# checking for common parents between all h nodes.
88
- first_header = body .css_first ('h1' )
89
+ first_header = body .css_first ("h1" )
89
90
if first_header :
90
91
return first_header .parent
91
92
92
- return None
93
+ return body
93
94
94
95
def _parse_content (self , content ):
95
96
"""Converts all new line characters and multiple spaces to a single space."""
@@ -109,8 +110,6 @@ def _parse_sections(self, title, body):
109
110
We can have pages that have content before the first title or that don't have a title,
110
111
we index that content first under the title of the original page.
111
112
"""
112
- body = self ._clean_body (body )
113
-
114
113
# Index content for pages that don't start with a title.
115
114
# We check for sections till 3 levels to avoid indexing all the content
116
115
# in this step.
@@ -248,7 +247,7 @@ def _parse_section_content(self, tag, *, depth=0):
248
247
contents .append (content )
249
248
next_tag = next_tag .next
250
249
251
- return self ._parse_content ('' .join (contents )), section_found
250
+ return self ._parse_content ("" .join (contents )), section_found
252
251
253
252
def _is_code_section (self , tag ):
254
253
"""
@@ -307,10 +306,42 @@ def parse(self, page):
307
306
'domain_data': {},
308
307
}
309
308
"""
310
- raise NotImplementedError
309
+ try :
310
+ content = self ._get_page_content (page )
311
+ if content :
312
+ return self ._process_content (page , content )
313
+ except Exception as e :
314
+ log .info ("Failed to index page." , path = page , exception = str (e ))
315
+ return {
316
+ "path" : page ,
317
+ "title" : "" ,
318
+ "sections" : [],
319
+ "domain_data" : {},
320
+ }
311
321
322
+ def _process_content (self , page , content ):
323
+ """Parses the content into a structured dict."""
324
+ html = self ._clean_body (HTMLParser (content ))
325
+ body = self ._get_main_node (html )
326
+ title = ""
327
+ sections = []
328
+ if body :
329
+ title = self ._get_page_title (body , html ) or page
330
+ sections = self ._get_sections (title = title , body = body )
331
+ else :
332
+ log .info (
333
+ "Page doesn't look like it has valid content, skipping." ,
334
+ page = page ,
335
+ )
336
+ return {
337
+ "path" : page ,
338
+ "title" : title ,
339
+ "sections" : sections ,
340
+ "domain_data" : {},
341
+ }
312
342
313
- class SphinxParser (BaseParser ):
343
+
344
+ class SphinxParser (GenericParser ):
314
345
315
346
"""
316
347
Parser for Sphinx generated html pages.
@@ -384,7 +415,7 @@ def _process_fjson(self, fjson_path):
384
415
385
416
if 'body' in data :
386
417
try :
387
- body = HTMLParser (data [' body' ] )
418
+ body = self . _clean_body ( HTMLParser (data [" body" ]) )
388
419
sections = self ._get_sections (title = title , body = body .body )
389
420
except Exception :
390
421
log .info ('Unable to index sections.' , path = fjson_path )
@@ -506,57 +537,15 @@ def _parse_domain_tag(self, tag):
506
537
return docstring
507
538
508
539
509
- class MkDocsParser (BaseParser ):
540
+ class MkDocsParser (GenericParser ):
510
541
511
542
"""
512
543
MkDocs parser.
513
544
514
- Index from the json index file or directly from the html content.
545
+ Index using the json index file instead of the html content.
515
546
"""
516
547
517
548
def parse (self , page ):
518
- # Avoid circular import
519
- from readthedocs .projects .models import Feature
520
- if self .project .has_feature (Feature .INDEX_FROM_HTML_FILES ):
521
- return self .parse_from_html (page )
522
- return self .parse_from_index_file (page )
523
-
524
- def parse_from_html (self , page ):
525
- try :
526
- content = self ._get_page_content (page )
527
- if content :
528
- return self ._process_content (page , content )
529
- except Exception as e :
530
- log .info ('Failed to index page.' , path = page , exception = str (e ))
531
- return {
532
- 'path' : page ,
533
- 'title' : '' ,
534
- 'sections' : [],
535
- 'domain_data' : {},
536
- }
537
-
538
- def _process_content (self , page , content ):
539
- """Parses the content into a structured dict."""
540
- html = HTMLParser (content )
541
- body = self ._get_main_node (html )
542
- title = ""
543
- sections = []
544
- if body :
545
- title = self ._get_page_title (body , html ) or page
546
- sections = self ._get_sections (title = title , body = body )
547
- else :
548
- log .info (
549
- "Page doesn't look like it has valid content, skipping." ,
550
- page = page ,
551
- )
552
- return {
553
- 'path' : page ,
554
- 'title' : title ,
555
- 'sections' : sections ,
556
- 'domain_data' : {},
557
- }
558
-
559
- def parse_from_index_file (self , page ):
560
549
storage_path = self .project .get_storage_path (
561
550
type_ = 'html' ,
562
551
version_slug = self .version .slug ,
0 commit comments