Search: improve parser (#7233)

stsewd · web-flow · commit 46381677dfbf · 2020-08-17T14:00:48.000-05:00
* Search: improve parser This will make the parser more general and match #7232 (also, one bug fix). - Try the main tag before trying the first h1 - Always inspect all headers up to 2 levels deep (this removes the need for the special case from Sphinx, where the h tag is inside a div) - `_parse_content` now not only removes all newline characters, but also collapses multiple spaces into one. - Remove elements with the search role in addition to the navigation role. - The headerlink class doesn't need to be inside an `a` tag. - Fix a bug where calling `.text()` on a text node returns an empty string. (I was able to catch this one now that we are checking up to 2 levels deep) * Increase depth Now that we prioritize the main tag as the main node, the main node from the mkdocs material theme is wider. * Strip spaces
diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py
@@ -66,6 +66,7 @@ def _get_main_node(self, html):
 
         - Try with a tag with the ``main`` role.
           This role is used by several static sites and themes.
+        - Try the ``main`` tag.
         - Try the first ``h1`` node and return its parent
           Usually all sections are neighbors,
           so they are children of the same parent node.
@@ -75,6 +76,10 @@ def _get_main_node(self, html):
         if main_node:
             return main_node
 
+        main_node = body.css_first('main')
+        if main_node:
+            return main_node
+
         # TODO: this could be done in smarter way,
         # checking for common parents between all h nodes.
         first_header = body.css_first('h1')
@@ -84,10 +89,8 @@ def _get_main_node(self, html):
         return None
 
     def _parse_content(self, content):
-        """Removes new line characters and strips all whitespaces."""
-        content = content.strip().split('\n')
-
-        # Convert all new lines to " "
+        """Converts all new line characters and multiple spaces to a single space."""
+        content = content.strip().split()
         content = (text.strip() for text in content)
         content = ' '.join(text for text in content if text)
         return content
@@ -106,12 +109,12 @@ def _parse_sections(self, title, body):
         body = self._clean_body(body)
 
         # Index content for pages that don't start with a title.
-        # We check for sections till 2 levels to avoid indexing all the content
+        # We check for sections till 3 levels to avoid indexing all the content
         # in this step.
         try:
             content, _ = self._parse_section_content(
                 body.child,
-                depth=2,
+                depth=3,
             )
             if content:
                 yield {
@@ -128,7 +131,7 @@ def _parse_sections(self, title, body):
             for tag in tags:
                 try:
                     title, id = self._parse_section_title(tag)
-                    content, _ = self._parse_section_content(tag.next)
+                    content, _ = self._parse_section_content(tag.next, depth=2)
                     yield {
                         'id': id,
                         'title': title,
@@ -146,7 +149,11 @@ def _clean_body(self, body):
            This will mutate the original `body`.
         """
         # Remove all navigation nodes
-        nodes_to_be_removed = body.css('[role=navigation]')
+        nodes_to_be_removed = itertools.chain(
+            body.css('nav'),
+            body.css('[role=navigation]'),
+            body.css('[role=search]'),
+        )
         for node in nodes_to_be_removed:
             node.decompose()
 
@@ -156,23 +163,10 @@ def _is_section(self, tag):
         """
         Check if `tag` is a section (linkeable header).
 
-        The tag is a section if:
-
-        - It's a ``h`` tag.
-        - It's a div with a ``section`` class.
+        The tag is a section if it's a ``h`` tag.
         """
         is_header_tag = re.match(r'h\d$', tag.tag)
-        if is_header_tag:
-            return True
-
-        is_div_section = (
-            tag.tag == 'div' and
-            'section' in tag.attributes.get('class', '').split()
-        )
-        if is_div_section:
-            return True
-
-        return False
+        return is_header_tag
 
     def _parse_section_title(self, tag):
         """
@@ -187,7 +181,7 @@ def _parse_section_title(self, tag):
 
         - Removes permalink values
         """
-        nodes_to_be_removed = tag.css('a.headerlink')
+        nodes_to_be_removed = tag.css('.headerlink')
         for node in nodes_to_be_removed:
             node.decompose()
 
@@ -219,7 +213,9 @@ def _parse_section_content(self, tag, *, depth=0):
             if self._is_code_section(next_tag):
                 content = self._parse_code_section(next_tag)
             elif depth <= 0 or not next_tag.child:
-                content = self._parse_content(next_tag.text())
+                # Calling .text() with deep `True` over a text node will return empty.
+                deep = next_tag.tag != '-text'
+                content = next_tag.text(deep=deep)
             else:
                 content, section_found = self._parse_section_content(
                     tag=next_tag.child,
@@ -230,7 +226,7 @@ def _parse_section_content(self, tag, *, depth=0):
                 contents.append(content)
             next_tag = next_tag.next
 
-        return ' '.join(contents), section_found
+        return self._parse_content(''.join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
diff --git a/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json b/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json
@@ -6,7 +6,7 @@
       {
         "id": "mkdocs",
         "title": "MkDocs",
-        "content": "Project documentation with\u00a0Markdown."
+        "content": "Project documentation with Markdown."
       },
       {
         "id": "overview",
diff --git a/readthedocs/search/tests/data/sphinx/out/page.json b/readthedocs/search/tests/data/sphinx/out/page.json
@@ -25,7 +25,7 @@
     {
       "id": "adding-a-new-scenario-to-the-repository",
       "title": "Adding a new scenario to the repository",
-      "content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex'  # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow()  # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
+      "content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
     },
     {
       "content": "This is a H3 title.",

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`{`
`7`	`7`	`"id": "mkdocs",`
`8`	`8`	`"title": "MkDocs",`
`9`		`- "content": "Project documentation with\u00a0Markdown."`
	`9`	`+ "content": "Project documentation with Markdown."`
`10`	`10`	`},`
`11`	`11`	`{`
`12`	`12`	`"id": "overview",`
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@`
`25`	`25`	`{`
`26`	`26`	`"id": "adding-a-new-scenario-to-the-repository",`
`27`	`27`	`"title": "Adding a new scenario to the repository",`
`28`		`- "content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex'  # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow()  # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"`
	`28`	`+ "content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"`
`29`	`29`	`},`
`30`	`30`	`{`
`31`	`31`	`"content": "This is a H3 title.",`