Merge pull request #7134 from readthedocs/fix-permalink

stsewd · web-flow · commit 4c6fe4ba6977 · 2020-05-27T13:44:49.000-05:00
Search: don't index permalinks
diff --git a/readthedocs/search/parse_json.py b/readthedocs/search/parse_json.py
@@ -61,7 +61,7 @@ def generate_page_sections(page_title, body, fjson_storage_path):
     for head_level in range(1, 7):
         tags = body.css(f'.section > h{head_level}')
         for tag in tags:
-            title = tag.text().replace('¶', '').strip()
+            title = _parse_title(tag)
 
             div = tag.parent
             section_id = div.attributes.get('id', '')
@@ -196,21 +196,29 @@ def _get_text_for_domain_data(desc):
     return docstrings
 
 
-def parse_content(content, remove_first_line=False):
+def parse_content(content):
     """Removes new line characters and ¶."""
     content = content.replace('¶', '').strip()
     content = content.split('\n')
 
-    # removing the starting text of each
-    if remove_first_line and len(content) > 1:
-        content = content[1:]
-
     # Convert all new lines to " "
     content = (text.strip() for text in content)
     content = ' '.join(text for text in content if text)
     return content
 
 
+def _parse_title(tag):
+    """
+    Parses a Sphinx title tag and returns its stripped text.
+
+    - Removes the permalink nodes (``a.headerlink``) before reading the text
+    """
+    nodes_to_be_removed = tag.css('a.headerlink')
+    for node in nodes_to_be_removed:
+        node.decompose()
+    return tag.text().strip()
+
+
 def process_mkdocs_index_file(json_storage_path, page):
     """Reads the json index file and parses it into a structured dict."""
     log.debug('Processing JSON index file: %s', json_storage_path)
diff --git a/readthedocs/search/tests/data/sphinx/in/page.html b/readthedocs/search/tests/data/sphinx/in/page.html
@@ -11,7 +11,7 @@ <h1>Title One<a class="headerlink" href="#title-one" title="Permalink to this he
   <p>This is another H1 title.</p>
 
   <div class="section" id="sub-title-one">
-    <h2>Sub-title one<a class="headerlink" href="#sub-title-one" title="Permalink to this headline">¶</a></h2>
+    <h2>Sub-title one<a class="headerlink" href="#sub-title-one" title="Permalink to this headline">§</a></h2>
     <p>Sub title</p>
 
     <div class="section" id="subsub-title">