log = structlog.get_logger(__name__)


-class BaseParser:
+class GenericParser:

    # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
    max_inner_documents = 10000
@@ -83,13 +83,7 @@ def _get_main_node(self, html):
        if main_node:
            return main_node

-        # TODO: this could be done in smarter way,
-        # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
-        if first_header:
-            return first_header.parent
-
-        return None
+        return body

    def _parse_content(self, content):
        """Converts all new line characters and multiple spaces to a single space."""
@@ -109,8 +103,6 @@ def _parse_sections(self, title, body):
        We can have pages that have content before the first title or that don't have a title,
        we index that content first under the title of the original page.
        """
-        body = self._clean_body(body)
-
        # Index content for pages that don't start with a title.
        # We check for sections till 3 levels to avoid indexing all the content
        # in this step.
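For context, the `_parse_content` docstring above promises whitespace normalization; a plausible minimal implementation with that behavior (the actual code may differ):

```python
import re

def parse_content(content):
    """Collapse newlines and runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", content).strip()

assert parse_content("Hello\n\n  world   !") == "Hello world !"
```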
@@ -248,7 +240,7 @@ def _parse_section_content(self, tag, *, depth=0):
            contents.append(content)
            next_tag = next_tag.next

-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found

    def _is_code_section(self, tag):
        """
@@ -307,10 +299,42 @@ def parse(self, page):
            'domain_data': {},
        }
        """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception as e:
+            log.info("Failed to index page.", path=page, exception=str(e))
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
+
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = self._clean_body(HTMLParser(content))
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }


-class SphinxParser(BaseParser):
+class SphinxParser(GenericParser):

    """
    Parser for Sphinx generated html pages.
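The new `parse` on `GenericParser` makes failure non-fatal: any exception during extraction degrades to an empty but well-formed document, so one broken page cannot abort indexing of a whole version. A standalone sketch of that pattern (names are illustrative, not the parser's API):

```python
def safe_parse(page, get_content, process):
    """Illustrative: always return the documented dict shape, even on error."""
    try:
        content = get_content(page)
        if content:
            return process(page, content)
    except Exception as e:
        print(f"Failed to index page. path={page} exception={e}")
    return {"path": page, "title": "", "sections": [], "domain_data": {}}

# A processor that raises still yields a valid (empty) result.
result = safe_parse("index.html", lambda p: "<html/>", lambda p, c: 1 / 0)
assert result == {"path": "index.html", "title": "", "sections": [], "domain_data": {}}
```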
@@ -384,7 +408,7 @@ def _process_fjson(self, fjson_path):

        if 'body' in data:
            try:
-                body = HTMLParser(data['body'])
+                body = self._clean_body(HTMLParser(data["body"]))
                sections = self._get_sections(title=title, body=body.body)
            except Exception:
                log.info('Unable to index sections.', path=fjson_path)
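With this change the fjson path also runs the parsed body through `_clean_body`, matching the HTML path in `_process_content`. A hypothetical sketch of the kind of pruning such a cleanup step performs with selectolax; the selectors here are assumptions, the real ones live in `_clean_body` itself:

```python
from selectolax.parser import HTMLParser

def clean_body(html):
    """Hypothetical: drop nodes that should never be indexed."""
    for selector in ("script", "style", "nav", "footer"):
        for node in html.css(selector):
            node.decompose()
    return html

html = clean_body(HTMLParser("<body><nav>menu</nav><p>Real text</p></body>"))
print(html.body.text(strip=True))  # 'Real text'
```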
@@ -506,57 +530,15 @@ def _parse_domain_tag(self, tag):
        return docstring


-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):

    """
    MkDocs parser.

-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
    """

    def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
        storage_path = self.project.get_storage_path(
            type_='html',
            version_slug=self.version.slug,