Skip to content

Commit 0caf2b2

Browse files
author
CM Lubinski
committed
Split process_file into a few functions.
1 parent a779196 commit 0caf2b2

File tree

1 file changed

+54
-45
lines changed

1 file changed

+54
-45
lines changed

readthedocs/search/parse_json.py

Lines changed: 54 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,57 @@ def process_all_json_files(version, build_dir=True):
3838
return page_list
3939

4040

41+
def process_headers(data, filename):
    """Read headers from toc data."""
    if 'toc' not in data:
        return []
    # Each anchor in the rendered toc is one header entry; recurse_while_none
    # digs out its text (and returns None when nothing usable is found).
    toc_anchors = PyQuery(data['toc'])('a')
    headers = [recurse_while_none(anchor) for anchor in toc_anchors]
    if None in headers:
        # Best effort: keep the partial list, but record the failure.
        log.info('Unable to index file headers for: %s', filename)
    return headers
50+
51+
52+
def generate_sections_from_pyquery(body):
    """Given a pyquery object, generate section dicts for each section.

    Yields dicts of the shape {'id': ..., 'title': ..., 'content': ...},
    first for the page intro (the h1 block, if any), then one per h2
    section found under elements with class "section".
    """
    # Capture text inside h1 before the first h2
    h1_section = body('.section > h1')
    if h1_section:
        div = h1_section.parent()
        # Sphinx appends a pilcrow (permalink marker) to titles; strip it.
        h1_title = h1_section.text().replace(u'¶', '').strip()
        h1_id = div.attr('id')
        h1_content = ""
        next_p = body('h1').next()
        # Walk the h1's following siblings, accumulating their HTML until
        # the first div with class "section" (i.e. the first subsection).
        while next_p:
            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
                if 'section' in next_p[0].attrib['class']:
                    break
            h1_content += "\n%s\n" % next_p.html()
            next_p = next_p.next()
        if h1_content:
            yield {
                'id': h1_id,
                'title': h1_title,
                'content': h1_content,
            }

    # Capture text inside h2's
    section_list = body('.section > h2')
    for num in range(len(section_list)):
        # The enclosing div carries the section's anchor id; the h2 its title.
        div = section_list.eq(num).parent()
        header = section_list.eq(num)
        title = header.text().replace(u'¶', '').strip()
        section_id = div.attr('id')
        content = div.html()
        yield {
            'id': section_id,
            'title': title,
            'content': content,
        }
        # NOTE(review): placed after the yield, so this only logs when the
        # consumer resumes the generator for the next item — confirm intended.
        log.debug("(Search Index) Section [%s:%s]: %s",
                  section_id, title, content)
90+
91+
4192
def process_file(filename):
4293
"""Read a file from disk and parse it into a structured dict."""
4394
try:
@@ -47,61 +98,18 @@ def process_file(filename):
4798
log.info('Unable to index file: %s, error :%s', filename, e)
4899
return
49100
data = json.loads(file_contents)
50-
headers = []
51101
sections = []
52-
content = ''
53102
title = ''
54103
body_content = ''
55104
if 'current_page_name' in data:
56105
path = data['current_page_name']
57106
else:
58107
log.info('Unable to index file due to no name %s', filename)
59108
return None
60-
if 'toc' in data:
61-
for element in PyQuery(data['toc'])('a'):
62-
headers.append(recurse_while_none(element))
63-
if None in headers:
64-
log.info('Unable to index file headers for: %s', filename)
65109
if 'body' in data and len(data['body']):
66110
body = PyQuery(data['body'])
67111
body_content = body.text().replace(u'¶', '')
68-
# Capture text inside h1 before the first h2
69-
h1_section = body('.section > h1')
70-
if h1_section:
71-
div = h1_section.parent()
72-
h1_title = h1_section.text().replace(u'¶', '').strip()
73-
h1_id = div.attr('id')
74-
h1_content = ""
75-
next_p = body('h1').next()
76-
while next_p:
77-
if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
78-
if 'section' in next_p[0].attrib['class']:
79-
break
80-
h1_content += "\n%s\n" % next_p.html()
81-
next_p = next_p.next()
82-
if h1_content:
83-
sections.append({
84-
'id': h1_id,
85-
'title': h1_title,
86-
'content': h1_content,
87-
})
88-
89-
# Capture text inside h2's
90-
section_list = body('.section > h2')
91-
for num in range(len(section_list)):
92-
div = section_list.eq(num).parent()
93-
header = section_list.eq(num)
94-
title = header.text().replace(u'¶', '').strip()
95-
section_id = div.attr('id')
96-
content = div.html()
97-
sections.append({
98-
'id': section_id,
99-
'title': title,
100-
'content': content,
101-
})
102-
log.debug("(Search Index) Section [%s:%s]: %s",
103-
section_id, title, content)
104-
112+
sections.extend(generate_sections_from_pyquery(body))
105113
else:
106114
log.info('Unable to index content for: %s', filename)
107115
if 'title' in data:
@@ -111,7 +119,8 @@ def process_file(filename):
111119
else:
112120
log.info('Unable to index title for: %s', filename)
113121

114-
return {'headers': headers, 'content': body_content, 'path': path,
122+
return {'headers': process_headers(data, filename),
123+
'content': body_content, 'path': path,
115124
'title': title, 'sections': sections}
116125

117126

0 commit comments

Comments
 (0)