Skip to content

Commit 47b62b2

Browse files
cmc333333 authored and agjohnson committed
Fixup linting issues in the search module (readthedocs#2884)
* Start of linting for search app.
* Combine several vars into a namedtuple. Flake8 warned about using too many local vars. Combine that input into a namedtuple and access the attributes from it.
* Remove unused argument.
* Split parse_sections function into two. This uses `extend` as the logic currently allows documentation_type to include both sphinx and mkdocs. If that doesn't make sense, let's simplify.
* Split process_file into a few functions.
1 parent c2ade9e commit 47b62b2

File tree

7 files changed

+229
-184
lines changed

7 files changed

+229
-184
lines changed

prospector-more.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ ignore-paths:
66
- core/
77
- donate/
88
- restapi/
9-
- search/
109

1110
pylint:
1211
options:

readthedocs/search/indexes.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ def search(self, body, **kwargs):
208208

209209
class ProjectIndex(Index):
210210

211+
"""Search index configuration for Projects"""
212+
211213
_type = 'project'
212214

213215
def get_mapping(self):
@@ -258,6 +260,8 @@ def extract_document(self, data):
258260

259261
class PageIndex(Index):
260262

263+
"""Search index configuration for Pages"""
264+
261265
_type = 'page'
262266
_parent = 'project'
263267

@@ -304,6 +308,8 @@ def extract_document(self, data):
304308

305309
class SectionIndex(Index):
306310

311+
"""Search index configuration for Sections"""
312+
307313
_type = 'section'
308314
_parent = 'page'
309315

readthedocs/search/lib.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
"""Utilities related to searching Elastic."""
12
from pprint import pprint
23

34
from django.conf import settings
@@ -12,7 +13,7 @@
1213

1314

1415
def search_project(request, query, language=None):
15-
16+
"""Search index for projects matching query"""
1617
body = {
1718
"query": {
1819
"bool": {

readthedocs/search/parse_json.py

Lines changed: 63 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- coding: utf-8 -*-
2+
"""Functions related to converting content into dict/JSON structures."""
23

34
import codecs
45
import fnmatch
@@ -19,7 +20,7 @@ def process_all_json_files(version, build_dir=True):
1920
full_path = version.project.get_production_media_path(
2021
type_='json', version_slug=version.slug, include_file=False)
2122
html_files = []
22-
for root, dirs, files in os.walk(full_path):
23+
for root, _, files in os.walk(full_path):
2324
for filename in fnmatch.filter(files, '*.fjson'):
2425
if filename in ['search.fjson', 'genindex.fjson', 'py-modindex.fjson']:
2526
continue
@@ -30,83 +31,96 @@ def process_all_json_files(version, build_dir=True):
3031
result = process_file(filename)
3132
if result:
3233
page_list.append(result)
34+
# we're unsure which exceptions can be raised
35+
# pylint: disable=bare-except
3336
except:
3437
pass
3538
return page_list
3639

3740

41+
def process_headers(data, filename):
42+
"""Read headers from toc data."""
43+
headers = []
44+
if 'toc' in data:
45+
for element in PyQuery(data['toc'])('a'):
46+
headers.append(recurse_while_none(element))
47+
if None in headers:
48+
log.info('Unable to index file headers for: %s', filename)
49+
return headers
50+
51+
52+
def generate_sections_from_pyquery(body):
53+
"""Given a pyquery object, generate section dicts for each section."""
54+
# Capture text inside h1 before the first h2
55+
h1_section = body('.section > h1')
56+
if h1_section:
57+
div = h1_section.parent()
58+
h1_title = h1_section.text().replace(u'¶', '').strip()
59+
h1_id = div.attr('id')
60+
h1_content = ""
61+
next_p = body('h1').next()
62+
while next_p:
63+
if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
64+
if 'section' in next_p[0].attrib['class']:
65+
break
66+
h1_content += "\n%s\n" % next_p.html()
67+
next_p = next_p.next()
68+
if h1_content:
69+
yield {
70+
'id': h1_id,
71+
'title': h1_title,
72+
'content': h1_content,
73+
}
74+
75+
# Capture text inside h2's
76+
section_list = body('.section > h2')
77+
for num in range(len(section_list)):
78+
div = section_list.eq(num).parent()
79+
header = section_list.eq(num)
80+
title = header.text().replace(u'¶', '').strip()
81+
section_id = div.attr('id')
82+
content = div.html()
83+
yield {
84+
'id': section_id,
85+
'title': title,
86+
'content': content,
87+
}
88+
log.debug("(Search Index) Section [%s:%s]: %s",
89+
section_id, title, content)
90+
91+
3892
def process_file(filename):
93+
"""Read a file from disk and parse it into a structured dict."""
3994
try:
4095
with codecs.open(filename, encoding='utf-8', mode='r') as f:
4196
file_contents = f.read()
4297
except IOError as e:
43-
log.info('Unable to index file: %s, error :%s' % (filename, e))
98+
log.info('Unable to index file: %s, error :%s', filename, e)
4499
return
45100
data = json.loads(file_contents)
46-
headers = []
47101
sections = []
48-
content = ''
49102
title = ''
50103
body_content = ''
51104
if 'current_page_name' in data:
52105
path = data['current_page_name']
53106
else:
54-
log.info('Unable to index file due to no name %s' % filename)
107+
log.info('Unable to index file due to no name %s', filename)
55108
return None
56-
if 'toc' in data:
57-
for element in PyQuery(data['toc'])('a'):
58-
headers.append(recurse_while_none(element))
59-
if None in headers:
60-
log.info('Unable to index file headers for: %s' % filename)
61109
if 'body' in data and len(data['body']):
62110
body = PyQuery(data['body'])
63111
body_content = body.text().replace(u'¶', '')
64-
# Capture text inside h1 before the first h2
65-
h1_section = body('.section > h1')
66-
if h1_section:
67-
div = h1_section.parent()
68-
h1_title = h1_section.text().replace(u'¶', '').strip()
69-
h1_id = div.attr('id')
70-
h1_content = ""
71-
next_p = body('h1').next()
72-
while next_p:
73-
if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
74-
if 'section' in next_p[0].attrib['class']:
75-
break
76-
h1_content += "\n%s\n" % next_p.html()
77-
next_p = next_p.next()
78-
if h1_content:
79-
sections.append({
80-
'id': h1_id,
81-
'title': h1_title,
82-
'content': h1_content,
83-
})
84-
85-
# Capture text inside h2's
86-
section_list = body('.section > h2')
87-
for num in range(len(section_list)):
88-
div = section_list.eq(num).parent()
89-
header = section_list.eq(num)
90-
title = header.text().replace(u'¶', '').strip()
91-
section_id = div.attr('id')
92-
content = div.html()
93-
sections.append({
94-
'id': section_id,
95-
'title': title,
96-
'content': content,
97-
})
98-
log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, title, content))
99-
112+
sections.extend(generate_sections_from_pyquery(body))
100113
else:
101-
log.info('Unable to index content for: %s' % filename)
114+
log.info('Unable to index content for: %s', filename)
102115
if 'title' in data:
103116
title = data['title']
104117
if title.startswith('<'):
105118
title = PyQuery(data['title']).text()
106119
else:
107-
log.info('Unable to index title for: %s' % filename)
120+
log.info('Unable to index title for: %s', filename)
108121

109-
return {'headers': headers, 'content': body_content, 'path': path,
122+
return {'headers': process_headers(data, filename),
123+
'content': body_content, 'path': path,
110124
'title': title, 'sections': sections}
111125

112126

readthedocs/search/signals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
"""We define custom Django signals to trigger before executing searches."""
12
import django.dispatch
23

34
before_project_search = django.dispatch.Signal(providing_args=["body"])

0 commit comments

Comments
 (0)