@@ -38,6 +38,57 @@ def process_all_json_files(version, build_dir=True):
     return page_list


+def process_headers(data, filename):
+    """Read headers from toc data."""
+    headers = []
+    if 'toc' in data:
+        for element in PyQuery(data['toc'])('a'):
+            headers.append(recurse_while_none(element))
+        if None in headers:
+            log.info('Unable to index file headers for: %s', filename)
+    return headers
+
+
+def generate_sections_from_pyquery(body):
+    """Given a pyquery object, generate section dicts for each section."""
+    # Capture text inside h1 before the first h2
+    h1_section = body('.section > h1')
+    if h1_section:
+        div = h1_section.parent()
+        h1_title = h1_section.text().replace(u'¶', '').strip()
+        h1_id = div.attr('id')
+        h1_content = ""
+        next_p = body('h1').next()
+        while next_p:
+            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
+                if 'section' in next_p[0].attrib['class']:
+                    break
+            h1_content += "\n%s\n" % next_p.html()
+            next_p = next_p.next()
+        if h1_content:
+            yield {
+                'id': h1_id,
+                'title': h1_title,
+                'content': h1_content,
+            }
+
+    # Capture text inside h2's
+    section_list = body('.section > h2')
+    for num in range(len(section_list)):
+        div = section_list.eq(num).parent()
+        header = section_list.eq(num)
+        title = header.text().replace(u'¶', '').strip()
+        section_id = div.attr('id')
+        content = div.html()
+        yield {
+            'id': section_id,
+            'title': title,
+            'content': content,
+        }
+        log.debug("(Search Index) Section [%s:%s]: %s",
+                  section_id, title, content)
+
+
 def process_file(filename):
     """Read a file from disk and parse it into a structured dict."""
     try:
@@ -47,61 +98,18 @@ def process_file(filename):
         log.info('Unable to index file: %s, error :%s', filename, e)
         return
     data = json.loads(file_contents)
-    headers = []
     sections = []
-    content = ''
     title = ''
     body_content = ''
     if 'current_page_name' in data:
         path = data['current_page_name']
     else:
         log.info('Unable to index file due to no name %s', filename)
         return None
-    if 'toc' in data:
-        for element in PyQuery(data['toc'])('a'):
-            headers.append(recurse_while_none(element))
-        if None in headers:
-            log.info('Unable to index file headers for: %s', filename)
     if 'body' in data and len(data['body']):
         body = PyQuery(data['body'])
         body_content = body.text().replace(u'¶', '')
-        # Capture text inside h1 before the first h2
-        h1_section = body('.section > h1')
-        if h1_section:
-            div = h1_section.parent()
-            h1_title = h1_section.text().replace(u'¶', '').strip()
-            h1_id = div.attr('id')
-            h1_content = ""
-            next_p = body('h1').next()
-            while next_p:
-                if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
-                    if 'section' in next_p[0].attrib['class']:
-                        break
-                h1_content += "\n%s\n" % next_p.html()
-                next_p = next_p.next()
-            if h1_content:
-                sections.append({
-                    'id': h1_id,
-                    'title': h1_title,
-                    'content': h1_content,
-                })
-
-        # Capture text inside h2's
-        section_list = body('.section > h2')
-        for num in range(len(section_list)):
-            div = section_list.eq(num).parent()
-            header = section_list.eq(num)
-            title = header.text().replace(u'¶', '').strip()
-            section_id = div.attr('id')
-            content = div.html()
-            sections.append({
-                'id': section_id,
-                'title': title,
-                'content': content,
-            })
-            log.debug("(Search Index) Section [%s:%s]: %s",
-                      section_id, title, content)
-
+        sections.extend(generate_sections_from_pyquery(body))
     else:
         log.info('Unable to index content for: %s', filename)
     if 'title' in data:
@@ -111,7 +119,8 @@ def process_file(filename):
     else:
         log.info('Unable to index title for: %s', filename)

-    return {'headers': headers, 'content': body_content, 'path': path,
+    return {'headers': process_headers(data, filename),
+            'content': body_content, 'path': path,
             'title': title, 'sections': sections}


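For reference, a minimal sketch of how the extracted generator might be exercised on a Sphinx-style body fragment. It assumes `pyquery` is installed and that the functions above are importable from a module called `parse_json` here (a hypothetical name; the diff does not show the file path). The sample markup and ids are invented for illustration.

```python
# Hypothetical usage sketch; `parse_json` stands in for whatever module
# the functions in the diff actually live in.
from pyquery import PyQuery

from parse_json import generate_sections_from_pyquery

# A tiny Sphinx-style body: an h1 section with some intro text, followed by
# a nested h2 subsection -- the shape the '.section > h1' / '.section > h2'
# selectors expect.
SAMPLE_BODY = u"""
<div class="body">
  <div class="section" id="intro">
    <h1>Intro¶</h1>
    <p>Opening paragraph before the first subsection.</p>
    <div class="section" id="usage">
      <h2>Usage¶</h2>
      <p>How to use it.</p>
    </div>
  </div>
</div>
"""

body = PyQuery(SAMPLE_BODY)
for section in generate_sections_from_pyquery(body):
    # Each yielded dict has 'id', 'title', and 'content' keys, e.g.
    # {'id': 'usage', 'title': 'Usage', 'content': '<h2>Usage¶</h2>...'}
    print('%s: %s' % (section['id'], section['title']))
```

`process_headers` can likewise be called unconditionally in the new return statement, since it falls back to an empty list when the page JSON has no `'toc'` key.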