Skip to content

Generate general sitemap.xml for projects #5122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Feb 19, 2019
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions docs/sitemaps.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Sitemaps
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this linked from anywhere? Should be in an toctree somewhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I wanted to link it under Feature Documentation but I forgot to do it.

I will move this file under docs/features/sitemaps.rst and will be linked automatically on that section.

========

Sitemaps_ allows us to inform search engines about URLs that are available for crawling
and communicate them additional information about each URL of the project:

* when it was last updated,
* how often it changes,
* how important it is in relation to other URLs in the site, and
* what translations are available for a page.

Read the Docs automatically generates a sitemap for each project that hosts
to improve results when performing a search on these search engines.
This allow us to prioritize results based on the version number, for example
to show ``latest`` as the top result followed by ``stable`` and then all the project's
versions sorted following semver_.

.. _semver: https://semver.org/
.. _Sitemaps: https://www.sitemaps.org/
6 changes: 3 additions & 3 deletions readthedocs/core/urls/subdomain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

"""URL configurations for subdomains."""
from functools import reduce
from operator import add
Expand All @@ -15,14 +13,16 @@
redirect_project_slug,
robots_txt,
serve_docs,
sitemap_xml,
)


handler500 = server_error_500
handler404 = server_error_404

subdomain_urls = [
url(r'robots.txt$', robots_txt, name='robots_txt'),
url(r'robots\.txt$', robots_txt, name='robots_txt'),
url(r'sitemap\.xml$', sitemap_xml, name='sitemap_xml'),
url(
r'^(?:|projects/(?P<subproject_slug>{project_slug})/)'
r'page/(?P<filename>.*)$'.format(**pattern_opts),
Expand Down
126 changes: 122 additions & 4 deletions readthedocs/core/views/serve.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

"""
Doc serving from Python.

Expand All @@ -26,6 +24,7 @@
SERVE_DOCS (['private']) - The list of ['private', 'public'] docs to serve.
"""

import itertools
import logging
import mimetypes
import os
Expand All @@ -36,6 +35,7 @@
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.shortcuts import get_object_or_404, render
from django.utils.encoding import iri_to_uri
from django.views.decorators.cache import cache_page
from django.views.static import serve

from readthedocs.builds.models import Version
Expand All @@ -44,6 +44,7 @@
from readthedocs.core.symlink import PrivateSymlink, PublicSymlink
from readthedocs.projects import constants
from readthedocs.projects.models import Project, ProjectRelationship
from readthedocs.projects.templatetags.projects_tags import sort_version_aware


log = logging.getLogger(__name__)
Expand Down Expand Up @@ -262,7 +263,7 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''):
files_tried.append(os.path.join(basepath, filename))

raise Http404(
'File not found. Tried these files: %s' % ','.join(files_tried),
'File not found. Tried these files: {}'.format(','.join(files_tried)),
)


Expand Down Expand Up @@ -309,4 +310,121 @@ def robots_txt(request, project):
if os.path.exists(fullpath):
return HttpResponse(open(fullpath).read(), content_type='text/plain')

return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain')
sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
scheme='https',
domain=project.subdomain(),
)
return HttpResponse(
'User-agent: *\nAllow: /\nSitemap: {}\n'.format(sitemap_url),
content_type='text/plain',
)


@map_project_slug
@cache_page(60 * 60 * 24 * 3) # 3 days
def sitemap_xml(request, project):
"""
Generate and serve a ``sitemap.xml`` for a particular ``project``.

The sitemap is generated from all the ``active`` and public versions of
``project``. These versions are sorted by using semantic versioning
prepending ``latest`` and ``stable`` (if they are enabled) at the beginning.

Following this order, the versions are assigned priorities and change
frequency. Starting from 1 and decreasing by 0.1 for priorities and starting
from daily, weekly to monthly for change frequency.

If the project is private, the view raises ``Http404``. On the other hand,
if the project is public but a version is private, this one is not included
in the sitemap.

:param request: Django request object
:param project: Project instance to generate the sitemap

:returns: response with the ``sitemap.xml`` template rendered

:rtype: django.http.HttpResponse
"""

def priorities_generator():
"""
Generator returning ``priority`` needed by sitemap.xml.

It generates values from 1 to 0.1 by decreasing in 0.1 on each
iteration. After 0.1 is reached, it will keep returning 0.1.
"""
priorities = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]
yield from itertools.chain(priorities, itertools.repeat(0.1))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yield from is not Python2 syntax compatible :(

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo on line 288: change change
and line 293: this one i not

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed!


def changefreqs_generator():
"""
Generator returning ``changefreq`` needed by sitemap.xml.

It returns ``daily`` on first iteration, then ``weekly`` and then it
will return always ``monthly``.

We are using ``monthly`` as last value because ``never`` is too
aggressive. If the tag is removed and a branch is created with the same
name, we will want bots to revisit this.
"""
changefreqs = ['daily', 'weekly']
yield from itertools.chain(changefreqs, itertools.repeat('monthly'))

if project.privacy_level == constants.PRIVATE:
raise Http404

sorted_versions = sort_version_aware(
project.versions.filter(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason not to use public( on the queryset here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it also defaults to only active projects, but can pass only_active=True also

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No specific reason. I just changed to Version.objects.public(project=project, only_active=True)

active=True,
privacy_level=constants.PUBLIC,
),
)

versions = []
for version, priority, changefreq in zip(
sorted_versions,
priorities_generator(),
changefreqs_generator(),
):
element = {
'loc': version.get_subdomain_url(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The URL should be properly escaped: https://www.sitemaps.org/protocol.html#escaping

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if there is something we need to do here, actually.

'priority': priority,
'changefreq': changefreq,
'languages': [],
}

# Version can be enabled, but not ``built`` yet. We want to show the
# link without a ``lastmod`` attribute
last_build = version.builds.order_by('-date').first()
if last_build:
element['lastmod'] = last_build.date.isoformat()

if project.translations.exists():
for translation in project.translations.all():
href = project.get_docs_url(
version_slug=version.slug,
lang_slug=translation.language,
private=version.privacy_level == constants.PRIVATE,
)
element['languages'].append({
'hreflang': translation.language,
'href': href,
})

# Add itself also as protocol requires
element['languages'].append({
'hreflang': project.language,
'href': element['loc'],
})

versions.append(element)

context = {
'versions': versions,
}
return render(
request,
'sitemap.xml',
context,
content_type='application/xml',
)
68 changes: 64 additions & 4 deletions readthedocs/rtd_tests/tests/test_doc_serving.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# -*- coding: utf-8 -*-

import os

import django_dynamic_fixture as fixture
import mock
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.http import Http404
from django.test import TestCase, RequestFactory
from django.test import RequestFactory, TestCase
from django.test.utils import override_settings
from django.urls import reverse
from mock import mock_open, patch

from readthedocs.builds.models import Version
from readthedocs.core.middleware import SubdomainMiddleware
from readthedocs.core.views import server_error_404_subdomain
from readthedocs.core.views.serve import _serve_symlink_docs
Expand Down Expand Up @@ -102,6 +103,26 @@ def test_robots_txt(self):
# Private projects/versions always return 404 for robots.txt
self.assertEqual(response.status_code, 404)

@override_settings(
USE_SUBDOMAIN=True,
PUBLIC_DOMAIN='readthedocs.io',
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
)
def test_sitemap_xml(self):
response = self.client.get(
reverse('sitemap_xml'),
HTTP_HOST='private.readthedocs.io',
)
self.assertEqual(response.status_code, 404)

self.client.force_login(self.eric)
response = self.client.get(
reverse('sitemap_xml'),
HTTP_HOST='private.readthedocs.io',
)
# Private projects/versions always return 404 for robots.txt
self.assertEqual(response.status_code, 404)


@override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC])
class TestPublicDocs(BaseDocServing):
Expand Down Expand Up @@ -149,7 +170,7 @@ def test_default_robots_txt(self):
HTTP_HOST='public.readthedocs.io',
)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, b'User-agent: *\nAllow: /\n')
self.assertEqual(response.content, b'User-agent: *\nAllow: /\nSitemap: https://public.readthedocs.io/sitemap.xml\n')

@override_settings(
PYTHON_MEDIA=False,
Expand Down Expand Up @@ -179,6 +200,7 @@ def test_custom_robots_txt(self, os_mock, open_mock):
PUBLIC_DOMAIN='readthedocs.io',
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
)

@patch('readthedocs.core.views.serve.os')
@patch('readthedocs.core.views.os')
def test_custom_404_page(self, os_view_mock, os_serve_mock):
Expand All @@ -200,3 +222,41 @@ def test_custom_404_page(self, os_view_mock, os_serve_mock):
response = server_error_404_subdomain(request)
self.assertEqual(response.status_code, 404)
self.assertTrue(response['X-Accel-Redirect'].endswith('/public/en/latest/404.html'))

@override_settings(
USE_SUBDOMAIN=True,
PUBLIC_DOMAIN='readthedocs.io',
ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
)
def test_sitemap_xml(self):
self.public.versions.update(active=True)
private_version = fixture.get(
Version,
privacy_level=constants.PRIVATE,
project=self.public,
)
response = self.client.get(
reverse('sitemap_xml'),
HTTP_HOST='public.readthedocs.io',
)
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/xml')
for version in self.public.versions.filter(privacy_level=constants.PUBLIC):
self.assertContains(
response,
self.public.get_docs_url(
version_slug=version.slug,
lang_slug=self.public.language,
private=False,
),
)

# stable is marked as PRIVATE and should not appear here
self.assertNotContains(
response,
self.public.get_docs_url(
version_slug=private_version.slug,
lang_slug=self.public.language,
private=True,
),
)
20 changes: 20 additions & 0 deletions readthedocs/templates/sitemap.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml">
{% for version in versions %}
<url>
<loc>{{ version.loc }}</loc>
{% for language in version.languages %}
<xhtml:link
rel="alternate"
hreflang="{{ language.hreflang }}"
href="{{ language.href }}"/>
{% endfor %}
{% if version.lastmod %}
<lastmod>{{ version.lastmod }}</lastmod>
{% endif %}
<changefreq>{{ version.changefreq }}</changefreq>
<priority>{{ version.priority }}</priority>
</url>
{% endfor %}
</urlset>