Skip to content

Commit 00ab116

Browse files
authored
Merge pull request #5979 from dojutsu-user/index-more-domain-data
Index more domain data into elasticsearch
2 parents 9fb0b69 + 4beee77 commit 00ab116

File tree

17 files changed

+279
-220
lines changed

17 files changed

+279
-220
lines changed

media/css/core.css

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -379,12 +379,11 @@ a.cta-btn:hover, a.cta-btn:active {
379379

380380
/* search */
381381

382-
.search {
383-
border-bottom: solid 1px #bfbfbf;
384-
margin-bottom: 24px;
385-
}
382+
.search { border-bottom: solid 1px #bfbfbf; margin-bottom: 24px; }
386383
.search input[type=text] { float: left; margin-right: 10px; padding: 8px 10px; }
387384
.search input[type=submit] { margin-top: 0; }
385+
/* this is the same as the css class ".highlighted" */
386+
.search-result-item span { background-color: #ee9; padding: 0 1px; margin: 0 1px; border-radius: 3px; -moz-border-radius: 3px; -webkit-border-radius: 3px; }
388387

389388
.filter { margin-bottom: 1em; }
390389
.filter dd { display: inline-block; margin-right: 0.75em; }

readthedocs/core/static-src/core/js/doc-embed/search.js

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ function attach_elastic_search_query(data) {
7878

7979
// Creating the result from elements
8080
var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX + "?highlight=" + $.urlencode(query);
81-
8281
var item = $('<a>', {'href': link});
82+
8383
item.html(title);
8484
item.find('span').addClass('highlighted');
8585
list_item.append(item);
@@ -88,7 +88,6 @@ function attach_elastic_search_query(data) {
8888
if (doc.project !== project) {
8989
var text = " (from project " + doc.project + ")";
9090
var extra = $('<span>', {'text': text});
91-
9291
list_item.append(extra);
9392
}
9493

@@ -103,10 +102,12 @@ function attach_elastic_search_query(data) {
103102
var content = "";
104103

105104
var domain = "";
106-
var domain_subtitle = "";
105+
var domain_role_name = "";
107106
var domain_subtitle_link = "";
108-
var domain_content = "";
109107
var domain_name = "";
108+
var domain_subtitle = "";
109+
var domain_content = "";
110+
var domain_docstrings = "";
110111

111112
var section_template = '' +
112113
'<div>' +
@@ -136,7 +137,7 @@ function attach_elastic_search_query(data) {
136137
section = inner_hits[j];
137138
section_subtitle = section._source.title;
138139
section_subtitle_link = link + "#" + section._source.id;
139-
section_content = [section._source.content.substring(0, MAX_SUBSTRING_LIMIT) + " ..."];
140+
section_content = [section._source.content.substr(0, MAX_SUBSTRING_LIMIT) + " ..."];
140141

141142
if (section.highlight) {
142143
if (section.highlight["sections.title"]) {
@@ -171,27 +172,29 @@ function attach_elastic_search_query(data) {
171172
if (inner_hits[j].type === "domains") {
172173

173174
domain = inner_hits[j];
174-
domain_subtitle = domain._source.role_name;
175+
domain_role_name = domain._source.role_name;
175176
domain_subtitle_link = link + "#" + domain._source.anchor;
176-
domain_content = "";
177177
domain_name = domain._source.name;
178+
domain_subtitle = "";
179+
domain_content = "";
180+
domain_docstrings = "";
178181

179-
if (
180-
typeof domain._source.display_name === "string" &&
181-
domain._source.display_name.length >= 1
182-
) {
183-
domain_subtitle = "(" + domain._source.role_name + ") " + domain._source.display_name;
182+
if (domain._source.docstrings !== "") {
183+
domain_docstrings = domain._source.docstrings.substr(0, MAX_SUBSTRING_LIMIT) + " ...";
184184
}
185185

186186
if (domain.highlight) {
187+
if (domain.highlight["domains.docstrings"]) {
188+
domain_docstrings = "... " + xss(domain.highlight["domains.docstrings"][0]) + " ...";
189+
}
190+
187191
if (domain.highlight["domains.name"]) {
188-
// domain_content = type_display -- name
189192
domain_name = xss(domain.highlight["domains.name"][0]);
190193
}
191194
}
192195

193-
// domain_content = type_display -- name -- in doc_display
194-
domain_content = domain._source.type_display + " -- " + domain_name + " -- in " + domain._source.doc_display;
196+
domain_subtitle = "[" + domain_role_name + "]: " + domain_name;
197+
domain_content = domain_docstrings;
195198

196199
append_html_to_contents(
197200
contents,

readthedocs/core/static/core/js/readthedocs-doc-embed.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

readthedocs/projects/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,7 @@ def get_processed_json(self):
13051305
'path': file_path,
13061306
'title': '',
13071307
'sections': [],
1308+
'domain_data': {},
13081309
}
13091310

13101311
@cached_property

readthedocs/projects/static/projects/js/tools.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

readthedocs/search/documents.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,17 +88,15 @@ class PageDocument(RTDDocTypeMixin, DocType):
8888
'role_name': fields.KeywordField(),
8989

9090
# For linking to the URL
91-
'doc_name': fields.KeywordField(),
9291
'anchor': fields.KeywordField(),
9392

9493
# For showing in the search result
9594
'type_display': fields.TextField(),
96-
'doc_display': fields.TextField(),
95+
'docstrings': fields.TextField(),
9796

9897
# Simple analyzer breaks on `.`,
9998
# otherwise search results are too strict for this use case
10099
'name': fields.TextField(analyzer='simple'),
101-
'display_name': fields.TextField(analyzer='simple'),
102100
}
103101
)
104102

@@ -122,12 +120,12 @@ def prepare_domains(self, html_file):
122120
all_domains = [
123121
{
124122
'role_name': domain.role_name,
125-
'doc_name': domain.doc_name,
126123
'anchor': domain.anchor,
127124
'type_display': domain.type_display,
128-
'doc_display': domain.doc_display,
125+
'docstrings': html_file.processed_json.get(
126+
'domain_data', {}
127+
).get(domain.anchor, ''),
129128
'name': domain.name,
130-
'display_name': domain.display_name if domain.display_name != '-' else '',
131129
}
132130
for domain in domains_qs
133131
]

readthedocs/search/faceted_search.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,8 @@ class PageSearchBase(RTDFacetedSearch):
104104
_outer_fields = ['title^4']
105105
_section_fields = ['sections.title^3', 'sections.content']
106106
_domain_fields = [
107-
'domains.type_display',
108107
'domains.name^2',
109-
'domains.display_name',
108+
'domains.docstrings',
110109
]
111110
_common_highlight_options = {
112111
'encoder': 'html',
@@ -134,8 +133,17 @@ def query(self, search, query):
134133
"""Manipulates query to support nested query."""
135134
search = search.highlight_options(**self._common_highlight_options)
136135

136+
all_queries = []
137+
137138
# match query for the title (of the page) field.
138-
match_title_query = Match(title=query)
139+
for operator in self.operators:
140+
all_queries.append(
141+
SimpleQueryString(
142+
query=query,
143+
fields=self.fields,
144+
default_operator=operator
145+
)
146+
)
139147

140148
# nested query for search in sections
141149
sections_nested_query = self.generate_nested_query(
@@ -162,21 +170,17 @@ def query(self, search, query):
162170
'highlight': dict(
163171
self._common_highlight_options,
164172
fields={
165-
'domains.type_display': {},
166173
'domains.name': {},
167-
'domains.display_name': {},
174+
'domains.docstrings': {},
168175
}
169176
)
170177
}
171178
)
172179

173-
final_query = Bool(should=[
174-
match_title_query,
175-
sections_nested_query,
176-
domains_nested_query,
177-
])
178-
180+
all_queries.extend([sections_nested_query, domains_nested_query])
181+
final_query = Bool(should=all_queries)
179182
search = search.query(final_query)
183+
180184
return search
181185

182186
def generate_nested_query(self, query, path, fields, inner_hits):

readthedocs/search/parse_json.py

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,23 @@
1212
log = logging.getLogger(__name__)
1313

1414

15-
def generate_sections_from_pyquery(body):
15+
def generate_sections_from_pyquery(body, fjson_storage_path):
1616
"""Given a pyquery object, generate section dicts for each section."""
17+
18+
# Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
19+
try:
20+
# remove all <dl> tags which contains <dt> tags having 'id' attribute
21+
dt_tags = body('dt[id]')
22+
dt_tags.parents('dl').remove()
23+
except Exception:
24+
log.exception('Error removing <dl> tags from file: %s', fjson_storage_path)
25+
26+
# remove toctree elements
27+
try:
28+
body('.toctree-wrapper').remove()
29+
except Exception:
30+
log.exception('Error removing toctree elements from file: %s', fjson_storage_path)
31+
1732
# Capture text inside h1 before the first h2
1833
h1_section = body('.section > h1')
1934
if h1_section:
@@ -27,7 +42,12 @@ def generate_sections_from_pyquery(body):
2742
if 'section' in next_p[0].attrib['class']:
2843
break
2944

30-
h1_content += parse_content(next_p.text())
45+
text = parse_content(next_p.text(), remove_first_line=True)
46+
if h1_content:
47+
h1_content = f'{h1_content.rstrip(".")}. {text}'
48+
else:
49+
h1_content = text
50+
3151
next_p = next_p.next()
3252
if h1_content:
3353
yield {
@@ -45,7 +65,7 @@ def generate_sections_from_pyquery(body):
4565
section_id = div.attr('id')
4666

4767
content = div.text()
48-
content = parse_content(content)
68+
content = parse_content(content, remove_first_line=True)
4969

5070
yield {
5171
'id': section_id,
@@ -74,6 +94,7 @@ def process_file(fjson_storage_path):
7494
sections = []
7595
path = ''
7696
title = ''
97+
domain_data = {}
7798

7899
if 'current_page_name' in data:
79100
path = data['current_page_name']
@@ -82,7 +103,8 @@ def process_file(fjson_storage_path):
82103

83104
if data.get('body'):
84105
body = PyQuery(data['body'])
85-
sections.extend(generate_sections_from_pyquery(body))
106+
sections.extend(generate_sections_from_pyquery(body.clone(), fjson_storage_path))
107+
domain_data = generate_domains_data_from_pyquery(body.clone(), fjson_storage_path)
86108
else:
87109
log.info('Unable to index content for: %s', fjson_storage_path)
88110

@@ -96,24 +118,70 @@ def process_file(fjson_storage_path):
96118
'path': path,
97119
'title': title,
98120
'sections': sections,
121+
'domain_data': domain_data,
99122
}
100123

101124

102-
def parse_content(content):
103-
"""
104-
Removes the starting text and ¶.
105-
106-
It removes the starting text from the content
107-
because it contains the title of that content,
108-
which is redundant here.
109-
"""
125+
def parse_content(content, remove_first_line=False):
126+
"""Removes new line characters and ¶."""
110127
content = content.replace('¶', '').strip()
111128

112129
# removing the starting text of each
113130
content = content.split('\n')
114-
if len(content) > 1: # there were \n
131+
if remove_first_line and len(content) > 1:
115132
content = content[1:]
116133

117134
# converting newlines to ". "
118135
content = '. '.join([text.strip().rstrip('.') for text in content])
119136
return content
137+
138+
139+
def _get_text_for_domain_data(desc_contents):
140+
"""Returns the text from the PyQuery object ``desc_contents``."""
141+
# remove the 'dl', 'dt' and 'dd' tags from it
142+
# because all the 'dd' and 'dt' tags are inside 'dl'
143+
# and all 'dl' tags are already captured.
144+
desc_contents.remove('dl')
145+
desc_contents.remove('dt')
146+
desc_contents.remove('dd')
147+
148+
# remove multiple spaces, new line characters and '¶' symbol.
149+
docstrings = parse_content(desc_contents.text())
150+
return docstrings
151+
152+
153+
def generate_domains_data_from_pyquery(body, fjson_storage_path):
154+
"""
155+
Given a pyquery object, generate sphinx domain objects' docstrings.
156+
157+
Returns a dict with the generated data.
158+
The returned dict is in the following form::
159+
160+
{
161+
"domain-id-1": "docstrings for the domain-id-1",
162+
"domain-id-2": "docstrings for the domain-id-2",
163+
}
164+
"""
165+
166+
domain_data = {}
167+
dl_tags = body('dl')
168+
169+
for dl_tag in dl_tags:
170+
171+
dt = dl_tag.findall('dt')
172+
dd = dl_tag.findall('dd')
173+
174+
# len(dt) should be equal to len(dd)
175+
# because these tags go together.
176+
for title, desc in zip(dt, dd):
177+
try:
178+
id_ = title.attrib.get('id')
179+
if id_:
180+
# clone the PyQuery objects so that
181+
# the original one remains undisturbed
182+
docstrings = _get_text_for_domain_data(PyQuery(desc).clone())
183+
domain_data[id_] = docstrings
184+
except Exception:
185+
log.exception('Error parsing docstrings for domains in file %s', fjson_storage_path)
186+
187+
return domain_data

readthedocs/search/tests/data/docs/support.json

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
{
66
"id": "usage-questions",
77
"title": "Usage Questions",
8-
"content": "If you have questions about how to use Read the Docs, or have an issue that isn’t related to a bug, Stack Overflow is the best place to ask. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
8+
"content": "For help, Stack Overflow is the palce. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
99
},
1010
{
1111
"id": "community-support",
@@ -20,22 +20,20 @@
2020
],
2121
"domains": [
2222
{
23-
"role_name": "http:post",
24-
"doc_name": "api/v3.html",
25-
"anchor": "post--api-v3-projects-(string-project_slug)-versions-(string-version_slug)-builds-",
26-
"type_display": "post",
27-
"doc_display": "API v3",
28-
"name": "/api/v3/projects/(string:project_slug)/versions/(string:version_slug)/builds/",
29-
"display_name": ""
23+
"role_name": "py:function",
24+
"anchor": "celery.utils.deprecated.warn",
25+
"type_display": "function",
26+
"name": "celery.utils.deprecated.warn"
3027
},
3128
{
32-
"role_name": "http:patch",
33-
"doc_name": "api/v3.html",
34-
"anchor": "patch--api-v3-projects-(string-project_slug)-version-(string-version_slug)-",
35-
"type_display": "patch",
36-
"doc_display": "API v3",
37-
"name": "/api/v3/projects/(string:project_slug)/version/(string:version_slug)/",
38-
"display_name": ""
29+
"role_name": "py:function",
30+
"anchor": "celery.utils.deprecated.Property",
31+
"type_display": "function",
32+
"name": "celery.utils.deprecated.Property"
3933
}
40-
]
34+
],
35+
"domain_data": {
36+
"celery.utils.deprecated.warn": "Warn of (pending) deprecation",
37+
"celery.utils.deprecated.Property": "Decorator for deprecated properties"
38+
}
4139
}

0 commit comments

Comments
 (0)