diff --git a/common b/common index f3c2619adb1..ac9eafa1e57 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit f3c2619adb171d1cbfae9c28922dddc46cd98cd1 +Subproject commit ac9eafa1e57c5ab3cff5af9acad1e93c85989e8c diff --git a/docs/features/sitemaps.rst b/docs/features/sitemaps.rst new file mode 100644 index 00000000000..385d46ae4b5 --- /dev/null +++ b/docs/features/sitemaps.rst @@ -0,0 +1,19 @@ +Sitemaps +======== + +Sitemaps_ allows us to inform search engines about URLs that are available for crawling +and communicate them additional information about each URL of the project: + +* when it was last updated, +* how often it changes, +* how important it is in relation to other URLs in the site, and +* what translations are available for a page. + +Read the Docs automatically generates a sitemap for each project that hosts +to improve results when performing a search on these search engines. +This allow us to prioritize results based on the version number, for example +to show ``latest`` as the top result followed by ``stable`` and then all the project's +versions sorted following semver_. + +.. _semver: https://semver.org/ +.. _Sitemaps: https://www.sitemaps.org/ diff --git a/readthedocs/core/urls/subdomain.py b/readthedocs/core/urls/subdomain.py index 4e4a6775e3a..8187668c738 100644 --- a/readthedocs/core/urls/subdomain.py +++ b/readthedocs/core/urls/subdomain.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """URL configurations for subdomains.""" from functools import reduce from operator import add @@ -15,6 +13,7 @@ redirect_project_slug, robots_txt, serve_docs, + sitemap_xml, ) @@ -22,7 +21,8 @@ handler404 = server_error_404 subdomain_urls = [ - url(r'robots.txt$', robots_txt, name='robots_txt'), + url(r'robots\.txt$', robots_txt, name='robots_txt'), + url(r'sitemap\.xml$', sitemap_xml, name='sitemap_xml'), url( r'^(?:|projects/(?P{project_slug})/)' r'page/(?P.*)$'.format(**pattern_opts), diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py index d756ab8f3ed..a0212687ab0 100644 --- a/readthedocs/core/views/serve.py +++ b/readthedocs/core/views/serve.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Doc serving from Python. @@ -26,6 +24,7 @@ SERVE_DOCS (['private']) - The list of ['private', 'public'] docs to serve. """ +import itertools import logging import mimetypes import os @@ -36,6 +35,7 @@ from django.http import Http404, HttpResponse, HttpResponseRedirect from django.shortcuts import get_object_or_404, render from django.utils.encoding import iri_to_uri +from django.views.decorators.cache import cache_page from django.views.static import serve from readthedocs.builds.models import Version @@ -44,6 +44,7 @@ from readthedocs.core.symlink import PrivateSymlink, PublicSymlink from readthedocs.projects import constants from readthedocs.projects.models import Project, ProjectRelationship +from readthedocs.projects.templatetags.projects_tags import sort_version_aware log = logging.getLogger(__name__) @@ -262,7 +263,7 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''): files_tried.append(os.path.join(basepath, filename)) raise Http404( - 'File not found. Tried these files: %s' % ','.join(files_tried), + 'File not found. Tried these files: {}'.format(','.join(files_tried)), ) @@ -309,4 +310,121 @@ def robots_txt(request, project): if os.path.exists(fullpath): return HttpResponse(open(fullpath).read(), content_type='text/plain') - return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain') + sitemap_url = '{scheme}://{domain}/sitemap.xml'.format( + scheme='https', + domain=project.subdomain(), + ) + return HttpResponse( + 'User-agent: *\nAllow: /\nSitemap: {}\n'.format(sitemap_url), + content_type='text/plain', + ) + + +@map_project_slug +@cache_page(60 * 60 * 24 * 3) # 3 days +def sitemap_xml(request, project): + """ + Generate and serve a ``sitemap.xml`` for a particular ``project``. + + The sitemap is generated from all the ``active`` and public versions of + ``project``. These versions are sorted by using semantic versioning + prepending ``latest`` and ``stable`` (if they are enabled) at the beginning. + + Following this order, the versions are assigned priorities and change + frequency. Starting from 1 and decreasing by 0.1 for priorities and starting + from daily, weekly to monthly for change frequency. + + If the project is private, the view raises ``Http404``. On the other hand, + if the project is public but a version is private, this one is not included + in the sitemap. + + :param request: Django request object + :param project: Project instance to generate the sitemap + + :returns: response with the ``sitemap.xml`` template rendered + + :rtype: django.http.HttpResponse + """ + + def priorities_generator(): + """ + Generator returning ``priority`` needed by sitemap.xml. + + It generates values from 1 to 0.1 by decreasing in 0.1 on each + iteration. After 0.1 is reached, it will keep returning 0.1. + """ + priorities = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + yield from itertools.chain(priorities, itertools.repeat(0.1)) + + def changefreqs_generator(): + """ + Generator returning ``changefreq`` needed by sitemap.xml. + + It returns ``daily`` on first iteration, then ``weekly`` and then it + will return always ``monthly``. + + We are using ``monthly`` as last value because ``never`` is too + aggressive. If the tag is removed and a branch is created with the same + name, we will want bots to revisit this. + """ + changefreqs = ['daily', 'weekly'] + yield from itertools.chain(changefreqs, itertools.repeat('monthly')) + + if project.privacy_level == constants.PRIVATE: + raise Http404 + + sorted_versions = sort_version_aware( + Version.objects.public( + project=project, + only_active=True, + ), + ) + + versions = [] + for version, priority, changefreq in zip( + sorted_versions, + priorities_generator(), + changefreqs_generator(), + ): + element = { + 'loc': version.get_subdomain_url(), + 'priority': priority, + 'changefreq': changefreq, + 'languages': [], + } + + # Version can be enabled, but not ``built`` yet. We want to show the + # link without a ``lastmod`` attribute + last_build = version.builds.order_by('-date').first() + if last_build: + element['lastmod'] = last_build.date.isoformat() + + if project.translations.exists(): + for translation in project.translations.all(): + href = project.get_docs_url( + version_slug=version.slug, + lang_slug=translation.language, + private=version.privacy_level == constants.PRIVATE, + ) + element['languages'].append({ + 'hreflang': translation.language, + 'href': href, + }) + + # Add itself also as protocol requires + element['languages'].append({ + 'hreflang': project.language, + 'href': element['loc'], + }) + + versions.append(element) + + context = { + 'versions': versions, + } + return render( + request, + 'sitemap.xml', + context, + content_type='application/xml', + ) diff --git a/readthedocs/rtd_tests/tests/test_doc_serving.py b/readthedocs/rtd_tests/tests/test_doc_serving.py index 732b7e4554f..2f613da3c38 100644 --- a/readthedocs/rtd_tests/tests/test_doc_serving.py +++ b/readthedocs/rtd_tests/tests/test_doc_serving.py @@ -1,16 +1,17 @@ -# -*- coding: utf-8 -*- + +import os import django_dynamic_fixture as fixture import mock -import os from django.conf import settings from django.contrib.auth.models import User from django.http import Http404 -from django.test import TestCase, RequestFactory +from django.test import RequestFactory, TestCase from django.test.utils import override_settings from django.urls import reverse from mock import mock_open, patch +from readthedocs.builds.models import Version from readthedocs.core.middleware import SubdomainMiddleware from readthedocs.core.views import server_error_404_subdomain from readthedocs.core.views.serve import _serve_symlink_docs @@ -102,6 +103,26 @@ def test_robots_txt(self): # Private projects/versions always return 404 for robots.txt self.assertEqual(response.status_code, 404) + @override_settings( + USE_SUBDOMAIN=True, + PUBLIC_DOMAIN='readthedocs.io', + ROOT_URLCONF=settings.SUBDOMAIN_URLCONF, + ) + def test_sitemap_xml(self): + response = self.client.get( + reverse('sitemap_xml'), + HTTP_HOST='private.readthedocs.io', + ) + self.assertEqual(response.status_code, 404) + + self.client.force_login(self.eric) + response = self.client.get( + reverse('sitemap_xml'), + HTTP_HOST='private.readthedocs.io', + ) + # Private projects/versions always return 404 for robots.txt + self.assertEqual(response.status_code, 404) + @override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC]) class TestPublicDocs(BaseDocServing): @@ -149,7 +170,7 @@ def test_default_robots_txt(self): HTTP_HOST='public.readthedocs.io', ) self.assertEqual(response.status_code, 200) - self.assertEqual(response.content, b'User-agent: *\nAllow: /\n') + self.assertEqual(response.content, b'User-agent: *\nAllow: /\nSitemap: https://public.readthedocs.io/sitemap.xml\n') @override_settings( PYTHON_MEDIA=False, @@ -179,6 +200,7 @@ def test_custom_robots_txt(self, os_mock, open_mock): PUBLIC_DOMAIN='readthedocs.io', ROOT_URLCONF=settings.SUBDOMAIN_URLCONF, ) + @patch('readthedocs.core.views.serve.os') @patch('readthedocs.core.views.os') def test_custom_404_page(self, os_view_mock, os_serve_mock): @@ -200,3 +222,41 @@ def test_custom_404_page(self, os_view_mock, os_serve_mock): response = server_error_404_subdomain(request) self.assertEqual(response.status_code, 404) self.assertTrue(response['X-Accel-Redirect'].endswith('/public/en/latest/404.html')) + + @override_settings( + USE_SUBDOMAIN=True, + PUBLIC_DOMAIN='readthedocs.io', + ROOT_URLCONF=settings.SUBDOMAIN_URLCONF, + ) + def test_sitemap_xml(self): + self.public.versions.update(active=True) + private_version = fixture.get( + Version, + privacy_level=constants.PRIVATE, + project=self.public, + ) + response = self.client.get( + reverse('sitemap_xml'), + HTTP_HOST='public.readthedocs.io', + ) + self.assertEqual(response.status_code, 200) + self.assertEqual(response['Content-Type'], 'application/xml') + for version in self.public.versions.filter(privacy_level=constants.PUBLIC): + self.assertContains( + response, + self.public.get_docs_url( + version_slug=version.slug, + lang_slug=self.public.language, + private=False, + ), + ) + + # stable is marked as PRIVATE and should not appear here + self.assertNotContains( + response, + self.public.get_docs_url( + version_slug=private_version.slug, + lang_slug=self.public.language, + private=True, + ), + ) diff --git a/readthedocs/templates/sitemap.xml b/readthedocs/templates/sitemap.xml new file mode 100644 index 00000000000..fa2c89fc57b --- /dev/null +++ b/readthedocs/templates/sitemap.xml @@ -0,0 +1,20 @@ + + + {% for version in versions %} + + {{ version.loc }} + {% for language in version.languages %} + + {% endfor %} + {% if version.lastmod %} + {{ version.lastmod }} + {% endif %} + {{ version.changefreq }} + {{ version.priority }} + + {% endfor %} +