 log = structlog.get_logger(__name__)
 
 
-class BaseParser:
+class GenericParser:
 
     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
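
For context on that cap: Elasticsearch rejects documents whose nested objects exceed `index.mapping.nested_objects.limit` (10000 by default), so a parser honoring it might truncate its section list before indexing. A hypothetical sketch; the helper name and truncation point are illustrative, not from this commit:

    def _clamp_sections(self, sections):
        # Illustrative only: drop sections past the ES nested-objects limit
        # so a bulk index request isn't rejected by Elasticsearch.
        if len(sections) > self.max_inner_documents:
            sections = sections[: self.max_inner_documents]
        return sections
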
@@ -73,6 +73,7 @@ def _get_main_node(self, html):
         - Try the first ``h1`` node and return its parent
           Usually all sections are neighbors,
           so they are children of the same parent node.
+        - Return the body element itself if all checks above fail.
         """
         body = html.body
         main_node = body.css_first('[role=main]')
@@ -85,11 +86,11 @@ def _get_main_node(self, html):
 
         # TODO: this could be done in smarter way,
         # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
+        first_header = body.css_first("h1")
         if first_header:
             return first_header.parent
 
-        return None
+        return body
 
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -248,7 +249,7 @@ def _parse_section_content(self, tag, *, depth=0):
                 contents.append(content)
             next_tag = next_tag.next
 
-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
@@ -307,10 +308,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception:
+            log.info("Failed to index page.", path=page, exc_info=True)
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
 
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = HTMLParser(content)
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }
 
-class SphinxParser(BaseParser):
+
+class SphinxParser(GenericParser):
 
     """
     Parser for Sphinx generated html pages.
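
With this hunk, `GenericParser.parse` stops raising `NotImplementedError`: it fetches the page, delegates to `_process_content`, and on any failure logs and returns an empty but well-formed record. A hypothetical caller; the constructor arguments are an assumption, not shown in this diff:

    parser = GenericParser(version)          # assumed constructor signature
    result = parser.parse("guides/index.html")
    # Whatever happens, the shape is stable:
    assert set(result) == {"path", "title", "sections", "domain_data"}
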
@@ -384,7 +417,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = HTMLParser(data["body"])
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
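
For readers unfamiliar with `.fjson` files: Sphinx's JSON builders serialize each page as a JSON object whose `body` key holds rendered HTML, which is why `_process_fjson` can feed it straight into `HTMLParser`. A hedged sketch of that flow, with an illustrative file path:

    import json
    from selectolax.parser import HTMLParser

    with open("install.fjson") as f:      # illustrative path
        data = json.load(f)
    if "body" in data:
        body = HTMLParser(data["body"])   # sections are then pulled from body.body
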
@@ -506,57 +539,15 @@ def _parse_domain_tag(self, tag):
         return docstring
 
 
-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):
 
     """
     MkDocs parser.
 
-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """
 
     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
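
MkDocs' search plugin writes the index this method reads to `search/search_index.json`, a JSON object with a `docs` list of `location`/`title`/`text` records. A sketch of consuming it directly; the real method resolves the path through `self.project.get_storage_path` and the storage backend rather than the local filesystem:

    import json

    with open("search/search_index.json") as f:   # illustrative local path
        index = json.load(f)
    for record in index.get("docs", []):
        print(record["location"], record["title"])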