
Index more domain data into elasticsearch #5979


Merged
36 commits merged on Aug 27, 2019

Commits (36)
7b3d3c3
Index more domain data
dojutsu-user Jul 23, 2019
f1f522b
update json parsing to capture sphinx data
dojutsu-user Jul 23, 2019
20777cb
remove redundant import
dojutsu-user Jul 23, 2019
d97738f
add comments
dojutsu-user Jul 23, 2019
c00575e
fix type
dojutsu-user Jul 23, 2019
e3a65dc
Merge branch 'master' into index-more-domain-data
dojutsu-user Jul 24, 2019
f57afe1
get domain signature also
dojutsu-user Jul 24, 2019
7a5e387
Merge branch 'master' into index-more-domain-data
dojutsu-user Jul 26, 2019
9362b64
correct documents.py
dojutsu-user Jul 26, 2019
3255f84
don't reindex domain data multiple times
dojutsu-user Jul 26, 2019
bd53562
use try...except a little lower
dojutsu-user Jul 26, 2019
fcbb2d0
Show domains docstrings
dojutsu-user Jul 28, 2019
b6221c6
enable highlighting in main site file search
dojutsu-user Jul 28, 2019
5f186e7
small fix in parse_json
dojutsu-user Jul 28, 2019
301b2af
Fix main site search to show sphinx domain docstrings
dojutsu-user Jul 28, 2019
9e2d497
remove _get_domain_data()
dojutsu-user Jul 28, 2019
597a59b
Merge branch 'master' into index-more-domain-data
dojutsu-user Jul 29, 2019
849ba4d
don't index toctree elements
dojutsu-user Jul 29, 2019
6501e7c
add try...except
dojutsu-user Jul 29, 2019
8e0fe5b
fix small error in parse_json
dojutsu-user Jul 29, 2019
8bb25b9
Update match query
dojutsu-user Jul 30, 2019
8cc8999
update parse_json to delete only specific <dl> tags
dojutsu-user Jul 30, 2019
090f187
Restore path and link
dojutsu-user Aug 5, 2019
d456bf0
index type_display
dojutsu-user Aug 5, 2019
5c74690
add comment back
dojutsu-user Aug 5, 2019
17c88aa
Merge branch 'master' into index-more-domain-data
dojutsu-user Aug 5, 2019
28ec7aa
update min files
dojutsu-user Aug 5, 2019
2081269
Merge branch 'master' into index-more-domain-data
dojutsu-user Aug 13, 2019
2fe0d0a
update minified files and parse_json
dojutsu-user Aug 13, 2019
6a9db48
Merge branch 'master' into index-more-domain-data
dojutsu-user Aug 19, 2019
e61fd36
fix lint
dojutsu-user Aug 19, 2019
b597da9
update json data
dojutsu-user Aug 20, 2019
5c3f71b
fix tests
dojutsu-user Aug 20, 2019
7d1ed0c
update json data
dojutsu-user Aug 21, 2019
9c9f03f
fix tests
dojutsu-user Aug 21, 2019
4beee77
Merge branch 'master' into index-more-domain-data
dojutsu-user Aug 21, 2019
7 changes: 3 additions & 4 deletions media/css/core.css
@@ -379,12 +379,11 @@ a.cta-btn:hover, a.cta-btn:active {

/* search */

.search {
border-bottom: solid 1px #bfbfbf;
margin-bottom: 24px;
}
.search { border-bottom: solid 1px #bfbfbf; margin-bottom: 24px; }
.search input[type=text] { float: left; margin-right: 10px; padding: 8px 10px; }
.search input[type=submit] { margin-top: 0; }
/* this is the same as the css class ".highlighted" */
.search-result-item span { background-color: #ee9; padding: 0 1px; margin: 0 1px; border-radius: 3px; -moz-border-radius: 3px; -webkit-border-radius: 3px; }

.filter { margin-bottom: 1em; }
.filter dd { display: inline-block; margin-right: 0.75em; }
37 changes: 20 additions & 17 deletions readthedocs/core/static-src/core/js/doc-embed/search.js
@@ -18,6 +18,7 @@ function attach_elastic_search_query(data) {
var version = data.version;
var language = data.language || 'en';
var api_host = data.api_host;
var canonical_url = data.canonical_url;

var query_override = function (query) {
var search_def = $.Deferred();
@@ -49,10 +50,9 @@
}
}

// Creating the result from elements
var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX + "?highlight=" + $.urlencode(query);

var link = canonical_url + doc.full_path + "?highlight=" + $.urlencode(query);
var item = $('<a>', {'href': link});

item.html(title);
item.find('span').addClass('highlighted');
list_item.append(item);
@@ -61,7 +61,6 @@
if (doc.project !== project) {
var text = " (from project " + doc.project + ")";
var extra = $('<span>', {'text': text});

list_item.append(extra);
}

@@ -76,10 +75,12 @@
var content = "";

var domain = "";
var domain_subtitle = "";
var domain_role_name = "";
var domain_subtitle_link = "";
var domain_content = "";
var domain_name = "";
var domain_subtitle = "";
var domain_content = "";
var domain_docstrings = "";

var section_template = '' +
'<div>' +
@@ -109,7 +110,7 @@ function attach_elastic_search_query(data) {
section = inner_hits[j];
section_subtitle = section._source.title;
section_subtitle_link = link + "#" + section._source.id;
section_content = [section._source.content.substring(0, MAX_SUBSTRING_LIMIT) + " ..."];
section_content = [section._source.content.substr(0, MAX_SUBSTRING_LIMIT) + " ..."];

if (section.highlight) {
if (section.highlight["sections.title"]) {
@@ -145,27 +146,29 @@
if (inner_hits[j].type === "domains") {

domain = inner_hits[j];
domain_subtitle = domain._source.role_name;
domain_role_name = domain._source.role_name;
domain_subtitle_link = link + "#" + domain._source.anchor;
domain_content = "";
domain_name = domain._source.name;
domain_subtitle = "";
domain_content = "";
domain_docstrings = "";

if (
typeof domain._source.display_name === "string" &&
domain._source.display_name.length >= 1
) {
domain_subtitle = "(" + domain._source.role_name + ") " + domain._source.display_name;
if (domain._source.docstrings !== "") {
domain_docstrings = domain._source.docstrings.substr(0, MAX_SUBSTRING_LIMIT) + " ...";
}

if (domain.highlight) {
if (domain.highlight["domains.docstrings"]) {
domain_docstrings = "... " + xss(domain.highlight["domains.docstrings"][0]) + " ...";
}

if (domain.highlight["domains.name"]) {
// domain_content = type_display -- name
domain_name = xss(domain.highlight["domains.name"][0]);
}
}

// domain_content = type_display -- name -- in doc_display
domain_content = domain._source.type_display + " -- " + domain_name + " -- in " + domain._source.doc_display;
domain_subtitle = "[" + domain_role_name + "]: " + domain_name;
domain_content = domain_docstrings;

contents.append(
$u.template(
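For reference, a minimal Python paraphrase of the new "domains" rendering logic above, using a hypothetical inner hit (real hits come from Elasticsearch; the field names mirror the _source fields used in the JS):

    MAX_SUBSTRING_LIMIT = 100  # assumed value; mirrors the JS constant's role

    hit = {
        '_source': {
            'role_name': 'py:function',
            'anchor': 'mymodule.my_func',
            'name': 'mymodule.my_func',
            'docstrings': 'Do something useful with arg.',
        },
        'highlight': {},  # may carry 'domains.name' / 'domains.docstrings' fragments
    }

    source = hit['_source']
    docstrings = ''
    if source['docstrings']:
        docstrings = source['docstrings'][:MAX_SUBSTRING_LIMIT] + ' ...'
    # Highlighted fragments, when present, replace the plain values.
    name = hit['highlight'].get('domains.name', [source['name']])[0]
    subtitle = '[' + source['role_name'] + ']: ' + name  # "[py:function]: mymodule.my_func"
    content = docstrings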
2 changes: 1 addition & 1 deletion readthedocs/core/static/core/js/readthedocs-doc-embed.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions readthedocs/projects/models.py
@@ -1271,6 +1271,7 @@ def get_processed_json(self):
'path': file_path,
'title': '',
'sections': [],
'domain_data': {},
}

@cached_property
9 changes: 1 addition & 8 deletions readthedocs/search/api.py
@@ -22,17 +22,10 @@ class PageSearchSerializer(serializers.Serializer):
project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField()
link = serializers.SerializerMethodField()
full_path = serializers.CharField()
highlight = serializers.SerializerMethodField()
inner_hits = serializers.SerializerMethodField()

def get_link(self, obj):
projects_url = self.context.get('projects_url')
if projects_url:
docs_url = projects_url[obj.project]
return docs_url + obj.path

def get_highlight(self, obj):
highlight = getattr(obj.meta, 'highlight', None)
if highlight:
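With link and path gone, a result now carries full_path and the embedded search JS joins it with the version's canonical URL. A small sketch with hypothetical values:

    # Hypothetical serialized result after this change.
    result = {
        'project': 'example-project',
        'version': 'latest',
        'title': 'Configuration',
        'full_path': 'config/index.html',
        'highlight': {},
        'inner_hits': [],
    }
    # The doc-embed JS (see search.js above) builds the final link client-side:
    canonical_url = 'https://example-project.readthedocs.io/en/latest/'
    link = canonical_url + result['full_path']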
11 changes: 2 additions & 9 deletions readthedocs/search/documents.py
@@ -70,7 +70,6 @@ class PageDocument(RTDDocTypeMixin, DocType):
# Metadata
project = fields.KeywordField(attr='project.slug')
version = fields.KeywordField(attr='version.slug')
path = fields.KeywordField(attr='processed_json.path')
full_path = fields.KeywordField(attr='path')

# Searchable content
@@ -88,17 +87,14 @@
'role_name': fields.KeywordField(),

# For linking to the URL
'doc_name': fields.KeywordField(),
'anchor': fields.KeywordField(),

# For showing in the search result
'type_display': fields.TextField(),
'doc_display': fields.TextField(),
Member: Why are we removing this?

Member Author: We don't need doc_display -- the main title of each result already contains it. But I have undone the change for type_display -- we can make use of that.

Member Author: 😕 I can't seem to find a good way to display all the information. type_display looks unnecessary when we are displaying role_name.

Member: Well, sure. They are showing the same information. I believe type_display is meant for human-readable output, and role_name is not, no?

Member Author: Both are quite readable:
py:function
std:confval
py:class

Member: I think "configuration value" over "confval" is much better for non-technical users. We should definitely try to use the display value; that's the whole reason it exists.

Member Author (@dojutsu-user, Aug 21, 2019): @ericholscher I realised (while testing) that some sphinx domains don't have values for type_display. Should we still show it for those that have it, or just keep showing role_name?

'docstrings': fields.TextField(),

# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
'name': fields.TextField(analyzer='simple'),
'display_name': fields.TextField(analyzer='simple'),
}
)

@@ -122,12 +118,9 @@ def prepare_domains(self, html_file):
all_domains = [
{
'role_name': domain.role_name,
'doc_name': domain.doc_name,
'anchor': domain.anchor,
'type_display': domain.type_display,
'doc_display': domain.doc_display,
'docstrings': html_file.processed_json.get('domain_data', {}).get(domain.anchor, ''),
'name': domain.name,
'display_name': domain.display_name if domain.display_name != '-' else '',
}
for domain in domains_qs
]
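To make the new docstrings lookup concrete, a short sketch; the domain object and data below are hypothetical stand-ins for the real SphinxDomain model and processed JSON:

    processed_json = {
        'domain_data': {
            'mymodule.my_func': 'Do something useful with arg',
        },
    }

    class Domain:  # hypothetical stand-in for a SphinxDomain instance
        role_name = 'py:function'
        anchor = 'mymodule.my_func'
        name = 'mymodule.my_func'

    domain = Domain()
    entry = {
        'role_name': domain.role_name,
        'anchor': domain.anchor,
        'name': domain.name,
        # anchors with no extracted text fall back to '', exactly as above
        'docstrings': processed_json.get('domain_data', {}).get(domain.anchor, ''),
    }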
26 changes: 15 additions & 11 deletions readthedocs/search/faceted_search.py
@@ -104,9 +104,8 @@ class PageSearchBase(RTDFacetedSearch):
_outer_fields = ['title^4']
_section_fields = ['sections.title^3', 'sections.content']
_domain_fields = [
'domains.type_display',
'domains.name^2',
'domains.display_name',
'domains.docstrings',
]
_common_highlight_options = {
'encoder': 'html',
@@ -134,8 +133,17 @@ def query(self, search, query):
"""Manipulates query to support nested query."""
search = search.highlight_options(**self._common_highlight_options)

all_queries = []

# match query for the title (of the page) field.
match_title_query = Match(title=query)
for operator in self.operators:
all_queries.append(
SimpleQueryString(
query=query,
fields=self.fields,
default_operator=operator
)
)

# nested query for search in sections
sections_nested_query = self.generate_nested_query(
@@ -162,21 +170,17 @@
'highlight': dict(
self._common_highlight_options,
fields={
'domains.type_display': {},
'domains.name': {},
'domains.display_name': {},
'domains.docstrings': {},
}
)
}
)

final_query = Bool(should=[
match_title_query,
sections_nested_query,
domains_nested_query,
])

all_queries.extend([sections_nested_query, domains_nested_query])
final_query = Bool(should=all_queries)
search = search.query(final_query)

return search

def generate_nested_query(self, query, path, fields, inner_hits):
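To show the final query shape, a minimal self-contained sketch assuming elasticsearch_dsl; unlike the PR's generate_nested_query helper, the nested query is inlined here and inner_hits/highlighting are omitted:

    from elasticsearch_dsl.query import Bool, Nested, SimpleQueryString

    def build_final_query(query):
        fields = ['title^4']       # page-level fields
        operators = ['and', 'or']  # assumed operator list
        # One SimpleQueryString per operator on the outer fields...
        all_queries = [
            SimpleQueryString(query=query, fields=fields, default_operator=op)
            for op in operators
        ]
        # ...plus a nested query against the 'domains' inner documents.
        all_queries.append(Nested(
            path='domains',
            query=SimpleQueryString(
                query=query,
                fields=['domains.name^2', 'domains.docstrings'],
            ),
        ))
        # Everything is OR-ed together in 'should': any clause can match.
        return Bool(should=all_queries)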
99 changes: 81 additions & 18 deletions readthedocs/search/parse_json.py
@@ -10,8 +10,23 @@
log = logging.getLogger(__name__)


def generate_sections_from_pyquery(body):
def generate_sections_from_pyquery(body, fjson_filename):
"""Given a pyquery object, generate section dicts for each section."""

# Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
try:
# remove all <dl> tags which contains <dt> tags having 'id' attribute
dt_tags = body('dt[id]')
dt_tags.parents('dl').remove()
except Exception:
log.exception('Error removing <dl> tags from file: %s', fjson_filename)

# remove toctree elements
try:
body('.toctree-wrapper').remove()
except Exception:
log.exception('Error removing toctree elements from file: %s', fjson_filename)

# Capture text inside h1 before the first h2
h1_section = body('.section > h1')
if h1_section:
@@ -25,7 +40,12 @@
if 'section' in next_p[0].attrib['class']:
break

h1_content += parse_content(next_p.text())
text = parse_content(next_p.text(), remove_first_line=True)
if h1_content:
h1_content = f'{h1_content.rstrip(".")}. {text}'
else:
h1_content = text

next_p = next_p.next()
if h1_content:
yield {
@@ -43,7 +63,7 @@
section_id = div.attr('id')

content = div.text()
content = parse_content(content)
content = parse_content(content, remove_first_line=True)

yield {
'id': section_id,
@@ -64,15 +84,12 @@ def process_file(fjson_filename):
sections = []
path = ''
title = ''

if 'current_page_name' in data:
path = data['current_page_name']
else:
log.info('Unable to index file due to no name %s', fjson_filename)
domain_data = {}

if data.get('body'):
body = PyQuery(data['body'])
sections.extend(generate_sections_from_pyquery(body))
sections.extend(generate_sections_from_pyquery(body.clone(), fjson_filename))
domain_data = generate_domains_data_from_pyquery(body.clone(), fjson_filename)
else:
log.info('Unable to index content for: %s', fjson_filename)

@@ -86,24 +103,70 @@
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
}


def parse_content(content):
"""
Removes the starting text and ¶.

It removes the starting text from the content
because it contains the title of that content,
which is redundant here.
"""
def parse_content(content, remove_first_line=False):
"""Removes new line characters and ¶."""
content = content.replace('¶', '').strip()

# removing the starting text of each
content = content.split('\n')
if len(content) > 1: # there were \n
if remove_first_line and len(content) > 1:
content = content[1:]

# converting newlines to ". "
content = '. '.join([text.strip().rstrip('.') for text in content])
return content
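A quick illustration of the new remove_first_line behaviour (the input text is hypothetical):

    text = 'My Section¶\nFirst paragraph.\nSecond paragraph.'
    parse_content(text)
    # -> 'My Section. First paragraph. Second paragraph'
    parse_content(text, remove_first_line=True)
    # -> 'First paragraph. Second paragraph'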


def _get_text_for_domain_data(desc_contents):
"""Returns the text from the PyQuery object ``desc_contents``."""
# remove the 'dl', 'dt' and 'dd' tags from it
# because all the 'dd' and 'dt' tags are inside 'dl'
# and all 'dl' tags are already captured.
desc_contents.remove('dl')
desc_contents.remove('dt')
desc_contents.remove('dd')

# remove multiple spaces, new line characters and '¶' symbol.
docstrings = parse_content(desc_contents.text())
return docstrings


def generate_domains_data_from_pyquery(body, fjson_filename):
"""
Given a pyquery object, generate sphinx domain objects' docstrings.

Returns a dict with the generated data.
The returned dict is in the following form::

{
"domain-id-1": "docstrings for the domain-id-1",
"domain-id-2": "docstrings for the domain-id-2",
}
"""

domain_data = {}
dl_tags = body('dl')

for dl_tag in dl_tags:

dt = dl_tag.findall('dt')
dd = dl_tag.findall('dd')

# len(dt) should be equal to len(dd)
# because these tags go together.
for title, desc in zip(dt, dd):
try:
id_ = title.attrib.get('id')
if id_:
# clone the PyQuery objects so that
# the original one remains undisturbed
docstrings = _get_text_for_domain_data(PyQuery(desc).clone())
domain_data[id_] = docstrings
except Exception:
log.exception('Error parsing docstrings for domains in file %s', fjson_filename)

return domain_data
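End to end, a small runnable sketch (assuming the pyquery package; the HTML fragment is hypothetical) of the data this function extracts from a Sphinx page body:

    from pyquery import PyQuery

    html = (
        '<div>'
        '<dl class="py function">'
        '<dt id="mymodule.my_func">my_func(arg)</dt>'
        '<dd><p>Do something useful with arg.</p></dd>'
        '</dl>'
        '</div>'
    )
    body = PyQuery(html)
    data = generate_domains_data_from_pyquery(body, 'example.fjson')
    # Expected: {'mymodule.my_func': 'Do something useful with arg'}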