Skip to content

Commit 0caf2b2

Browse files
author
CM Lubinski
committed
Split process_file into a few functions.
1 parent a779196 commit 0caf2b2

File tree

1 file changed

+54
-45
lines changed

1 file changed

+54
-45
lines changed

readthedocs/search/parse_json.py

Lines changed: 54 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,57 @@ def process_all_json_files(version, build_dir=True):
3838
return page_list
3939

4040

41+
def process_headers(data, filename):
    """Read headers from toc data."""
    if 'toc' not in data:
        return []
    # Each anchor in the rendered toc is one header entry; recurse_while_none
    # digs out its text (and returns None when nothing usable is found).
    toc_anchors = PyQuery(data['toc'])('a')
    headers = [recurse_while_none(anchor) for anchor in toc_anchors]
    if None in headers:
        # Best effort: keep the partial list, but record the failure.
        log.info('Unable to index file headers for: %s', filename)
    return headers
50+
51+
52+
def generate_sections_from_pyquery(body):
    """Given a pyquery object, generate section dicts for each section.

    Yields dicts of the shape {'id': ..., 'title': ..., 'content': ...},
    first for the page intro (the h1 block, if any), then one per h2
    section found under elements with class "section".
    """
    # Capture text inside h1 before the first h2
    h1_section = body('.section > h1')
    if h1_section:
        div = h1_section.parent()
        # Sphinx appends a pilcrow (permalink marker) to titles; strip it.
        h1_title = h1_section.text().replace(u'¶', '').strip()
        h1_id = div.attr('id')
        h1_content = ""
        next_p = body('h1').next()
        # Walk the h1's following siblings, accumulating their HTML until
        # the first div with class "section" (i.e. the first subsection).
        while next_p:
            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
                if 'section' in next_p[0].attrib['class']:
                    break
            h1_content += "\n%s\n" % next_p.html()
            next_p = next_p.next()
        if h1_content:
            yield {
                'id': h1_id,
                'title': h1_title,
                'content': h1_content,
            }

    # Capture text inside h2's
    section_list = body('.section > h2')
    for num in range(len(section_list)):
        # The enclosing div carries the section's anchor id; the h2 its title.
        div = section_list.eq(num).parent()
        header = section_list.eq(num)
        title = header.text().replace(u'¶', '').strip()
        section_id = div.attr('id')
        content = div.html()
        yield {
            'id': section_id,
            'title': title,
            'content': content,
        }
        # NOTE(review): placed after the yield, so this only logs when the
        # consumer resumes the generator for the next item — confirm intended.
        log.debug("(Search Index) Section [%s:%s]: %s",
                  section_id, title, content)
90+
91+
4192
def process_file(filename):
4293
"""Read a file from disk and parse it into a structured dict."""
4394
try:
@@ -47,61 +98,18 @@ def process_file(filename):
4798
log.info('Unable to index file: %s, error :%s', filename, e)
4899
return
49100
data = json.loads(file_contents)
50-
headers = []
51101
sections = []
52-
content = ''
53102
title = ''
54103
body_content = ''
55104
if 'current_page_name' in data:
56105
path = data['current_page_name']
57106
else:
58107
log.info('Unable to index file due to no name %s', filename)
59108
return None
60-
if 'toc' in data:
61-
for element in PyQuery(data['toc'])('a'):
62-
headers.append(recurse_while_none(element))
63-
if None in headers:
64-
log.info('Unable to index file headers for: %s', filename)
65109
if 'body' in data and len(data['body']):
66110
body = PyQuery(data['body'])
67111
body_content = body.text().replace(u'¶', '')
68-
# Capture text inside h1 before the first h2
69-
h1_section = body('.section > h1')
70-
if h1_section:
71-
div = h1_section.parent()
72-
h1_title = h1_section.text().replace(u'¶', '').strip()
73-
h1_id = div.attr('id')
74-
h1_content = ""
75-
next_p = body('h1').next()
76-
while next_p:
77-
if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
78-
if 'section' in next_p[0].attrib['class']:
79-
break
80-
h1_content += "\n%s\n" % next_p.html()
81-
next_p = next_p.next()
82-
if h1_content:
83-
sections.append({
84-
'id': h1_id,
85-
'title': h1_title,
86-
'content': h1_content,
87-
})
88-
89-
# Capture text inside h2's
90-
section_list = body('.section > h2')
91-
for num in range(len(section_list)):
92-
div = section_list.eq(num).parent()
93-
header = section_list.eq(num)
94-
title = header.text().replace(u'¶', '').strip()
95-
section_id = div.attr('id')
96-
content = div.html()
97-
sections.append({
98-
'id': section_id,
99-
'title': title,
100-
'content': content,
101-
})
102-
log.debug("(Search Index) Section [%s:%s]: %s",
103-
section_id, title, content)
104-
112+
sections.extend(generate_sections_from_pyquery(body))
105113
else:
106114
log.info('Unable to index content for: %s', filename)
107115
if 'title' in data:
@@ -111,7 +119,8 @@ def process_file(filename):
111119
else:
112120
log.info('Unable to index title for: %s', filename)
113121

114-
return {'headers': headers, 'content': body_content, 'path': path,
122+
return {'headers': process_headers(data, filename),
123+
'content': body_content, 'path': path,
115124
'title': title, 'sections': sections}
116125

117126

0 commit comments

Comments
 (0)