Skip to content

Commit f1c15d4

Browse files
authored
Merge pull request readthedocs#5122 from rtfd/humitos/sitemap-xml
Generate general sitemap.xml for projects
2 parents d901982 + 6482bac commit f1c15d4

File tree

6 files changed

+229
-12
lines changed

6 files changed

+229
-12
lines changed

common

docs/features/sitemaps.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Sitemaps
2+
========
3+
4+
Sitemaps_ allows us to inform search engines about URLs that are available for crawling
5+
and to communicate additional information to them about each URL of the project:
6+
7+
* when it was last updated,
8+
* how often it changes,
9+
* how important it is in relation to other URLs in the site, and
10+
* what translations are available for a page.
11+
12+
Read the Docs automatically generates a sitemap for each project that it hosts
13+
to improve results when performing a search on these search engines.
14+
This allows us to prioritize results based on the version number, for example
15+
to show ``latest`` as the top result followed by ``stable`` and then all the project's
16+
versions sorted following semver_.
17+
18+
.. _semver: https://semver.org/
19+
.. _Sitemaps: https://www.sitemaps.org/

readthedocs/core/urls/subdomain.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# -*- coding: utf-8 -*-
2-
31
"""URL configurations for subdomains."""
42
from functools import reduce
53
from operator import add
@@ -15,14 +13,16 @@
1513
redirect_project_slug,
1614
robots_txt,
1715
serve_docs,
16+
sitemap_xml,
1817
)
1918

2019

2120
handler500 = server_error_500
2221
handler404 = server_error_404
2322

2423
subdomain_urls = [
25-
url(r'robots.txt$', robots_txt, name='robots_txt'),
24+
url(r'robots\.txt$', robots_txt, name='robots_txt'),
25+
url(r'sitemap\.xml$', sitemap_xml, name='sitemap_xml'),
2626
url(
2727
r'^(?:|projects/(?P<subproject_slug>{project_slug})/)'
2828
r'page/(?P<filename>.*)$'.format(**pattern_opts),

readthedocs/core/views/serve.py

Lines changed: 122 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# -*- coding: utf-8 -*-
2-
31
"""
42
Doc serving from Python.
53
@@ -26,6 +24,7 @@
2624
SERVE_DOCS (['private']) - The list of ['private', 'public'] docs to serve.
2725
"""
2826

27+
import itertools
2928
import logging
3029
import mimetypes
3130
import os
@@ -36,6 +35,7 @@
3635
from django.http import Http404, HttpResponse, HttpResponseRedirect
3736
from django.shortcuts import get_object_or_404, render
3837
from django.utils.encoding import iri_to_uri
38+
from django.views.decorators.cache import cache_page
3939
from django.views.static import serve
4040

4141
from readthedocs.builds.models import Version
@@ -44,6 +44,7 @@
4444
from readthedocs.core.symlink import PrivateSymlink, PublicSymlink
4545
from readthedocs.projects import constants
4646
from readthedocs.projects.models import Project, ProjectRelationship
47+
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
4748

4849

4950
log = logging.getLogger(__name__)
@@ -262,7 +263,7 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''):
262263
files_tried.append(os.path.join(basepath, filename))
263264

264265
raise Http404(
265-
'File not found. Tried these files: %s' % ','.join(files_tried),
266+
'File not found. Tried these files: {}'.format(','.join(files_tried)),
266267
)
267268

268269

@@ -309,4 +310,121 @@ def robots_txt(request, project):
309310
if os.path.exists(fullpath):
310311
return HttpResponse(open(fullpath).read(), content_type='text/plain')
311312

312-
return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain')
313+
sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
314+
scheme='https',
315+
domain=project.subdomain(),
316+
)
317+
return HttpResponse(
318+
'User-agent: *\nAllow: /\nSitemap: {}\n'.format(sitemap_url),
319+
content_type='text/plain',
320+
)
321+
322+
323+
@map_project_slug
324+
@cache_page(60 * 60 * 24 * 3) # 3 days
325+
def sitemap_xml(request, project):
326+
"""
327+
Generate and serve a ``sitemap.xml`` for a particular ``project``.
328+
329+
The sitemap is generated from all the ``active`` and public versions of
330+
``project``. These versions are sorted by using semantic versioning
331+
prepending ``latest`` and ``stable`` (if they are enabled) at the beginning.
332+
333+
Following this order, the versions are assigned priorities and change
334+
frequency. Starting from 1 and decreasing by 0.1 for priorities and starting
335+
from daily, weekly to monthly for change frequency.
336+
337+
If the project is private, the view raises ``Http404``. On the other hand,
338+
if the project is public but a version is private, this one is not included
339+
in the sitemap.
340+
341+
:param request: Django request object
342+
:param project: Project instance to generate the sitemap
343+
344+
:returns: response with the ``sitemap.xml`` template rendered
345+
346+
:rtype: django.http.HttpResponse
347+
"""
348+
349+
def priorities_generator():
350+
"""
351+
Generator returning ``priority`` needed by sitemap.xml.
352+
353+
It generates values from 1 to 0.1 by decreasing in 0.1 on each
354+
iteration. After 0.1 is reached, it will keep returning 0.1.
355+
"""
356+
priorities = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]
357+
yield from itertools.chain(priorities, itertools.repeat(0.1))
358+
359+
def changefreqs_generator():
360+
"""
361+
Generator returning ``changefreq`` needed by sitemap.xml.
362+
363+
It returns ``daily`` on first iteration, then ``weekly`` and then it
364+
will return always ``monthly``.
365+
366+
We are using ``monthly`` as last value because ``never`` is too
367+
aggressive. If the tag is removed and a branch is created with the same
368+
name, we will want bots to revisit this.
369+
"""
370+
changefreqs = ['daily', 'weekly']
371+
yield from itertools.chain(changefreqs, itertools.repeat('monthly'))
372+
373+
if project.privacy_level == constants.PRIVATE:
374+
raise Http404
375+
376+
sorted_versions = sort_version_aware(
377+
Version.objects.public(
378+
project=project,
379+
only_active=True,
380+
),
381+
)
382+
383+
versions = []
384+
for version, priority, changefreq in zip(
385+
sorted_versions,
386+
priorities_generator(),
387+
changefreqs_generator(),
388+
):
389+
element = {
390+
'loc': version.get_subdomain_url(),
391+
'priority': priority,
392+
'changefreq': changefreq,
393+
'languages': [],
394+
}
395+
396+
# Version can be enabled, but not ``built`` yet. We want to show the
397+
# link without a ``lastmod`` attribute
398+
last_build = version.builds.order_by('-date').first()
399+
if last_build:
400+
element['lastmod'] = last_build.date.isoformat()
401+
402+
if project.translations.exists():
403+
for translation in project.translations.all():
404+
href = project.get_docs_url(
405+
version_slug=version.slug,
406+
lang_slug=translation.language,
407+
private=version.privacy_level == constants.PRIVATE,
408+
)
409+
element['languages'].append({
410+
'hreflang': translation.language,
411+
'href': href,
412+
})
413+
414+
# Add itself also as protocol requires
415+
element['languages'].append({
416+
'hreflang': project.language,
417+
'href': element['loc'],
418+
})
419+
420+
versions.append(element)
421+
422+
context = {
423+
'versions': versions,
424+
}
425+
return render(
426+
request,
427+
'sitemap.xml',
428+
context,
429+
content_type='application/xml',
430+
)

readthedocs/rtd_tests/tests/test_doc_serving.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
# -*- coding: utf-8 -*-
1+
2+
import os
23

34
import django_dynamic_fixture as fixture
45
import mock
5-
import os
66
from django.conf import settings
77
from django.contrib.auth.models import User
88
from django.http import Http404
9-
from django.test import TestCase, RequestFactory
9+
from django.test import RequestFactory, TestCase
1010
from django.test.utils import override_settings
1111
from django.urls import reverse
1212
from mock import mock_open, patch
1313

14+
from readthedocs.builds.models import Version
1415
from readthedocs.core.middleware import SubdomainMiddleware
1516
from readthedocs.core.views import server_error_404_subdomain
1617
from readthedocs.core.views.serve import _serve_symlink_docs
@@ -102,6 +103,26 @@ def test_robots_txt(self):
102103
# Private projects/versions always return 404 for robots.txt
103104
self.assertEqual(response.status_code, 404)
104105

106+
@override_settings(
107+
USE_SUBDOMAIN=True,
108+
PUBLIC_DOMAIN='readthedocs.io',
109+
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
110+
)
111+
def test_sitemap_xml(self):
112+
response = self.client.get(
113+
reverse('sitemap_xml'),
114+
HTTP_HOST='private.readthedocs.io',
115+
)
116+
self.assertEqual(response.status_code, 404)
117+
118+
self.client.force_login(self.eric)
119+
response = self.client.get(
120+
reverse('sitemap_xml'),
121+
HTTP_HOST='private.readthedocs.io',
122+
)
123+
# Private projects/versions always return 404 for sitemap.xml
124+
self.assertEqual(response.status_code, 404)
125+
105126

106127
@override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC])
107128
class TestPublicDocs(BaseDocServing):
@@ -149,7 +170,7 @@ def test_default_robots_txt(self):
149170
HTTP_HOST='public.readthedocs.io',
150171
)
151172
self.assertEqual(response.status_code, 200)
152-
self.assertEqual(response.content, b'User-agent: *\nAllow: /\n')
173+
self.assertEqual(response.content, b'User-agent: *\nAllow: /\nSitemap: https://public.readthedocs.io/sitemap.xml\n')
153174

154175
@override_settings(
155176
PYTHON_MEDIA=False,
@@ -179,6 +200,7 @@ def test_custom_robots_txt(self, os_mock, open_mock):
179200
PUBLIC_DOMAIN='readthedocs.io',
180201
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
181202
)
203+
182204
@patch('readthedocs.core.views.serve.os')
183205
@patch('readthedocs.core.views.os')
184206
def test_custom_404_page(self, os_view_mock, os_serve_mock):
@@ -200,3 +222,41 @@ def test_custom_404_page(self, os_view_mock, os_serve_mock):
200222
response = server_error_404_subdomain(request)
201223
self.assertEqual(response.status_code, 404)
202224
self.assertTrue(response['X-Accel-Redirect'].endswith('/public/en/latest/404.html'))
225+
226+
@override_settings(
227+
USE_SUBDOMAIN=True,
228+
PUBLIC_DOMAIN='readthedocs.io',
229+
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
230+
)
231+
def test_sitemap_xml(self):
232+
self.public.versions.update(active=True)
233+
private_version = fixture.get(
234+
Version,
235+
privacy_level=constants.PRIVATE,
236+
project=self.public,
237+
)
238+
response = self.client.get(
239+
reverse('sitemap_xml'),
240+
HTTP_HOST='public.readthedocs.io',
241+
)
242+
self.assertEqual(response.status_code, 200)
243+
self.assertEqual(response['Content-Type'], 'application/xml')
244+
for version in self.public.versions.filter(privacy_level=constants.PUBLIC):
245+
self.assertContains(
246+
response,
247+
self.public.get_docs_url(
248+
version_slug=version.slug,
249+
lang_slug=self.public.language,
250+
private=False,
251+
),
252+
)
253+
254+
# The version created above is PRIVATE and should not appear here
255+
self.assertNotContains(
256+
response,
257+
self.public.get_docs_url(
258+
version_slug=private_version.slug,
259+
lang_slug=self.public.language,
260+
private=True,
261+
),
262+
)

readthedocs/templates/sitemap.xml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
3+
xmlns:xhtml="http://www.w3.org/1999/xhtml">
4+
{% for version in versions %}
5+
<url>
6+
<loc>{{ version.loc }}</loc>
7+
{% for language in version.languages %}
8+
<xhtml:link
9+
rel="alternate"
10+
hreflang="{{ language.hreflang }}"
11+
href="{{ language.href }}"/>
12+
{% endfor %}
13+
{% if version.lastmod %}
14+
<lastmod>{{ version.lastmod }}</lastmod>
15+
{% endif %}
16+
<changefreq>{{ version.changefreq }}</changefreq>
17+
<priority>{{ version.priority }}</priority>
18+
</url>
19+
{% endfor %}
20+
</urlset>

0 commit comments

Comments
 (0)