# -*- coding: utf-8 -*-
+"""Functions related to converting content into dict/JSON structures."""

import codecs
import fnmatch
@@ -19,7 +20,7 @@ def process_all_json_files(version, build_dir=True):
        full_path = version.project.get_production_media_path(
            type_='json', version_slug=version.slug, include_file=False)
    html_files = []
-    for root, dirs, files in os.walk(full_path):
+    for root, _, files in os.walk(full_path):
        for filename in fnmatch.filter(files, '*.fjson'):
            if filename in ['search.fjson', 'genindex.fjson', 'py-modindex.fjson']:
                continue
@@ -30,83 +31,96 @@ def process_all_json_files(version, build_dir=True):
            result = process_file(filename)
            if result:
                page_list.append(result)
+        # we're unsure which exceptions can be raised
+        # pylint: disable=bare-except
        except:
            pass
    return page_list


+def process_headers(data, filename):
+    """Read headers from toc data."""
+    headers = []
+    if 'toc' in data:
+        for element in PyQuery(data['toc'])('a'):
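+            # recurse_while_none() recurses into child elements until it
+            # finds non-None link text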
+            headers.append(recurse_while_none(element))
+        if None in headers:
+            log.info('Unable to index file headers for: %s', filename)
+    return headers
+
+
+def generate_sections_from_pyquery(body):
+    """Given a pyquery object, generate section dicts for each section."""
+    # Capture text inside h1 before the first h2
+    h1_section = body('.section > h1')
+    if h1_section:
+        div = h1_section.parent()
+        h1_title = h1_section.text().replace(u'¶', '').strip()
+        h1_id = div.attr('id')
+        h1_content = ""
+        next_p = body('h1').next()
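+        # Accumulate the HTML of following siblings until the next
+        # section <div> begins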
+        while next_p:
+            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
+                if 'section' in next_p[0].attrib['class']:
+                    break
+            h1_content += "\n%s\n" % next_p.html()
+            next_p = next_p.next()
+        if h1_content:
+            yield {
+                'id': h1_id,
+                'title': h1_title,
+                'content': h1_content,
+            }
+
+    # Capture text inside h2's
+    section_list = body('.section > h2')
+    for num in range(len(section_list)):
+        div = section_list.eq(num).parent()
+        header = section_list.eq(num)
+        title = header.text().replace(u'¶', '').strip()
+        section_id = div.attr('id')
+        content = div.html()
+        yield {
+            'id': section_id,
+            'title': title,
+            'content': content,
+        }
+        log.debug("(Search Index) Section [%s:%s]: %s",
+                  section_id, title, content)
+
+
def process_file(filename):
+    """Read a file from disk and parse it into a structured dict."""
    try:
        with codecs.open(filename, encoding='utf-8', mode='r') as f:
            file_contents = f.read()
    except IOError as e:
-        log.info('Unable to index file: %s, error :%s' % (filename, e))
+        log.info('Unable to index file: %s, error :%s', filename, e)
        return
    data = json.loads(file_contents)
-    headers = []
    sections = []
-    content = ''
    title = ''
    body_content = ''
    if 'current_page_name' in data:
        path = data['current_page_name']
    else:
-        log.info('Unable to index file due to no name %s' % filename)
+        log.info('Unable to index file due to no name %s', filename)
        return None
-    if 'toc' in data:
-        for element in PyQuery(data['toc'])('a'):
-            headers.append(recurse_while_none(element))
-        if None in headers:
-            log.info('Unable to index file headers for: %s' % filename)
    if 'body' in data and len(data['body']):
        body = PyQuery(data['body'])
        body_content = body.text().replace(u'¶', '')
-        # Capture text inside h1 before the first h2
-        h1_section = body('.section > h1')
-        if h1_section:
-            div = h1_section.parent()
-            h1_title = h1_section.text().replace(u'¶', '').strip()
-            h1_id = div.attr('id')
-            h1_content = ""
-            next_p = body('h1').next()
-            while next_p:
-                if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
-                    if 'section' in next_p[0].attrib['class']:
-                        break
-                h1_content += "\n%s\n" % next_p.html()
-                next_p = next_p.next()
-            if h1_content:
-                sections.append({
-                    'id': h1_id,
-                    'title': h1_title,
-                    'content': h1_content,
-                })
-
-        # Capture text inside h2's
-        section_list = body('.section > h2')
-        for num in range(len(section_list)):
-            div = section_list.eq(num).parent()
-            header = section_list.eq(num)
-            title = header.text().replace(u'¶', '').strip()
-            section_id = div.attr('id')
-            content = div.html()
-            sections.append({
-                'id': section_id,
-                'title': title,
-                'content': content,
-            })
-            log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, title, content))
-
+        sections.extend(generate_sections_from_pyquery(body))
    else:
-        log.info('Unable to index content for: %s' % filename)
+        log.info('Unable to index content for: %s', filename)
    if 'title' in data:
        title = data['title']
        if title.startswith('<'):
            title = PyQuery(data['title']).text()
    else:
-        log.info('Unable to index title for: %s' % filename)
+        log.info('Unable to index title for: %s', filename)

-    return {'headers': headers, 'content': body_content, 'path': path,
+    return {'headers': process_headers(data, filename),
+            'content': body_content, 'path': path,
            'title': title, 'sections': sections}
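
For reviewers, a minimal usage sketch of how the refactored helpers compose, assuming this module is imported as-is; the .fjson path and the HTML snippet are invented for illustration and are not part of the commit:

    # Hypothetical usage; the path below is invented.
    page = process_file('/path/to/json/latest/install.fjson')
    if page:  # None means the file was unreadable or had no page name
        for section in page['sections']:
            print('%s: %s' % (section['id'], section['title']))

    # generate_sections_from_pyquery() can also be driven directly:
    from pyquery import PyQuery
    body = PyQuery(u'<div class="section" id="install">'
                   u'<h2>Install¶</h2><p>pip install foo</p></div>')
    for section in generate_sections_from_pyquery(body):
        print(section)  # {'id': 'install', 'title': 'Install', 'content': ...}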