# -*- coding: utf-8 -*-
+"""Functions related to converting content into dict/JSON structures."""

import codecs
import fnmatch
@@ -19,7 +20,7 @@ def process_all_json_files(version, build_dir=True):
        full_path = version.project.get_production_media_path(
            type_='json', version_slug=version.slug, include_file=False)
    html_files = []
-    for root, dirs, files in os.walk(full_path):
+    for root, _, files in os.walk(full_path):
        for filename in fnmatch.filter(files, '*.fjson'):
            if filename in ['search.fjson', 'genindex.fjson', 'py-modindex.fjson']:
                continue
@@ -30,83 +31,96 @@ def process_all_json_files(version, build_dir=True):
            result = process_file(filename)
            if result:
                page_list.append(result)
+        # we're unsure which exceptions can be raised
+        # pylint: disable=bare-except
        except:
            pass
    return page_list


+def process_headers(data, filename):
+    """Read headers from toc data."""
+    headers = []
+    if 'toc' in data:
+        for element in PyQuery(data['toc'])('a'):
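+            # recurse_while_none() recurses into child elements until it
+            # finds non-None link text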
+            headers.append(recurse_while_none(element))
+        if None in headers:
+            log.info('Unable to index file headers for: %s', filename)
+    return headers
+
+
+def generate_sections_from_pyquery(body):
+    """Given a pyquery object, generate section dicts for each section."""
+    # Capture text inside h1 before the first h2
+    h1_section = body('.section > h1')
+    if h1_section:
+        div = h1_section.parent()
+        h1_title = h1_section.text().replace(u'¶', '').strip()
+        h1_id = div.attr('id')
+        h1_content = ""
+        next_p = body('h1').next()
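+        # Accumulate the HTML of following siblings until the next
+        # section <div> begins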
+        while next_p:
+            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
+                if 'section' in next_p[0].attrib['class']:
+                    break
+            h1_content += "\n%s\n" % next_p.html()
+            next_p = next_p.next()
+        if h1_content:
+            yield {
+                'id': h1_id,
+                'title': h1_title,
+                'content': h1_content,
+            }
+
+    # Capture text inside h2's
+    section_list = body('.section > h2')
+    for num in range(len(section_list)):
+        div = section_list.eq(num).parent()
+        header = section_list.eq(num)
+        title = header.text().replace(u'¶', '').strip()
+        section_id = div.attr('id')
+        content = div.html()
+        yield {
+            'id': section_id,
+            'title': title,
+            'content': content,
+        }
+        log.debug("(Search Index) Section [%s:%s]: %s",
+                  section_id, title, content)
+
+
def process_file(filename):
+    """Read a file from disk and parse it into a structured dict."""
    try:
        with codecs.open(filename, encoding='utf-8', mode='r') as f:
            file_contents = f.read()
    except IOError as e:
-        log.info('Unable to index file: %s, error :%s' % (filename, e))
+        log.info('Unable to index file: %s, error :%s', filename, e)
        return
    data = json.loads(file_contents)
-    headers = []
    sections = []
-    content = ''
    title = ''
    body_content = ''
    if 'current_page_name' in data:
        path = data['current_page_name']
    else:
-        log.info('Unable to index file due to no name %s' % filename)
+        log.info('Unable to index file due to no name %s', filename)
        return None
-    if 'toc' in data:
-        for element in PyQuery(data['toc'])('a'):
-            headers.append(recurse_while_none(element))
-        if None in headers:
-            log.info('Unable to index file headers for: %s' % filename)
    if 'body' in data and len(data['body']):
        body = PyQuery(data['body'])
        body_content = body.text().replace(u'¶', '')
-        # Capture text inside h1 before the first h2
-        h1_section = body('.section > h1')
-        if h1_section:
-            div = h1_section.parent()
-            h1_title = h1_section.text().replace(u'¶', '').strip()
-            h1_id = div.attr('id')
-            h1_content = ""
-            next_p = body('h1').next()
-            while next_p:
-                if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
-                    if 'section' in next_p[0].attrib['class']:
-                        break
-                h1_content += "\n%s\n" % next_p.html()
-                next_p = next_p.next()
-            if h1_content:
-                sections.append({
-                    'id': h1_id,
-                    'title': h1_title,
-                    'content': h1_content,
-                })
-
-        # Capture text inside h2's
-        section_list = body('.section > h2')
-        for num in range(len(section_list)):
-            div = section_list.eq(num).parent()
-            header = section_list.eq(num)
-            title = header.text().replace(u'¶', '').strip()
-            section_id = div.attr('id')
-            content = div.html()
-            sections.append({
-                'id': section_id,
-                'title': title,
-                'content': content,
-            })
-            log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, title, content))
-
+        sections.extend(generate_sections_from_pyquery(body))
    else:
-        log.info('Unable to index content for: %s' % filename)
+        log.info('Unable to index content for: %s', filename)
    if 'title' in data:
        title = data['title']
        if title.startswith('<'):
            title = PyQuery(data['title']).text()
    else:
-        log.info('Unable to index title for: %s' % filename)
+        log.info('Unable to index title for: %s', filename)

-    return {'headers': headers, 'content': body_content, 'path': path,
+    return {'headers': process_headers(data, filename),
+            'content': body_content, 'path': path,
            'title': title, 'sections': sections}
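
For reviewers, a minimal usage sketch of how the refactored helpers compose, assuming this module is imported as-is; the .fjson path and the HTML snippet are invented for illustration and are not part of the commit:

    # Hypothetical usage; the path below is invented.
    page = process_file('/path/to/json/latest/install.fjson')
    if page:  # None means the file was unreadable or had no page name
        for section in page['sections']:
            print('%s: %s' % (section['id'], section['title']))

    # generate_sections_from_pyquery() can also be driven directly:
    from pyquery import PyQuery
    body = PyQuery(u'<div class="section" id="install">'
                   u'<h2>Install¶</h2><p>pip install foo</p></div>')
    for section in generate_sections_from_pyquery(body):
        print(section)  # {'id': 'install', 'title': 'Install', 'content': ...}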