From 3fc254fcde1282e151b15652bb0424fa2a337d17 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Thu, 10 Jan 2019 12:31:48 +0100
Subject: [PATCH 01/10] Support custom robots.txt

Check for a custom `robots.txt` on the default version and if it does exist
serve it. Otherwise, return 404.
---
 readthedocs/core/urls/subdomain.py |  6 ++++-
 readthedocs/core/views/serve.py    | 35 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/readthedocs/core/urls/subdomain.py b/readthedocs/core/urls/subdomain.py
index 826c6443660..56f981de71e 100644
--- a/readthedocs/core/urls/subdomain.py
+++ b/readthedocs/core/urls/subdomain.py
@@ -10,7 +10,7 @@
 from readthedocs.core.views.serve import (
     redirect_page_with_filename,
-    redirect_project_slug, serve_docs
+    redirect_project_slug, serve_docs, robots_txt,
 )
 from readthedocs.core.views import (
     server_error_500,
@@ -22,6 +22,10 @@
 handler404 = server_error_404
 
 subdomain_urls = [
+    url((r'robots.txt$'.format(**pattern_opts)),
+        robots_txt,
+        name='robots_txt'),
+
     url(r'^(?:|projects/(?P<project_slug>{project_slug})/)'
         r'page/(?P<filename>.*)$'.format(**pattern_opts),
         redirect_page_with_filename,
diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index f7741631e5e..9a0aff804e4 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -223,3 +223,38 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''):
 
     raise Http404(
         'File not found. Tried these files: %s' % ','.join(files_tried))
+
+
+@map_project_slug
+def robots_txt(request, project):
+    """
+    Serve custom user's defined ``/robots.txt``.
+
+    If the user added a ``robots.txt`` in the "default version" of the project,
+    we serve it directly.
+    """
+    if project.privacy_level == constants.PRIVATE:
+        # If project is private, there is nothing to communicate to the bots.
+        raise Http404()
+
+    # Use the ``robots.txt`` file from the default version configured
+    version_slug = project.get_default_version()
+
+    filename = resolve_path(
+        project,
+        version_slug=version_slug,
+        filename='robots.txt',
+        subdomain=True,  # subdomain will make it a "full" path without a URL prefix
+    )
+
+    # This breaks path joining, by ignoring the root when given an "absolute" path
+    if filename[0] == '/':
+        filename = filename[1:]
+
+    basepath = PublicSymlink(project).project_root
+    fullpath = os.path.join(basepath, filename)
+
+    if os.path.exists(fullpath):
+        return HttpResponse(open(fullpath).read(), content_type='text/plain')
+
+    raise Http404()
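
The sketch below is not part of the patch series; it only illustrates the path
resolution the new ``robots_txt`` view performs. The project slug ``pip``, the
language ``en``, the version ``latest`` and the symlink root are made-up
values, and ``resolve_path()`` returning ``/en/latest/robots.txt`` for a
subdomain request is an assumption based on the ``subdomain=True`` comment in
the diff above::

    import os

    # Assumed output of resolve_path(project, version_slug='latest',
    # filename='robots.txt', subdomain=True) for an English project.
    filename = '/en/latest/robots.txt'

    # Same dance as the view: drop the leading slash, otherwise
    # os.path.join() would discard the base path.
    if filename[0] == '/':
        filename = filename[1:]

    # Made-up public symlink root for a project with slug 'pip'.
    basepath = '/var/www/public_web_root/pip'

    fullpath = os.path.join(basepath, filename)
    print(fullpath)  # /var/www/public_web_root/pip/en/latest/robots.txt
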
From 5d2c24c69c29dd6b17dfa3d42f840080be482e33 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 11:15:02 +0100
Subject: [PATCH 02/10] Proper URL formatting

---
 readthedocs/core/urls/subdomain.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/readthedocs/core/urls/subdomain.py b/readthedocs/core/urls/subdomain.py
index 56f981de71e..23f1553245f 100644
--- a/readthedocs/core/urls/subdomain.py
+++ b/readthedocs/core/urls/subdomain.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 """URL configurations for subdomains."""
 from __future__ import absolute_import
 
@@ -22,9 +24,7 @@
 handler404 = server_error_404
 
 subdomain_urls = [
-    url((r'robots.txt$'.format(**pattern_opts)),
-        robots_txt,
-        name='robots_txt'),
+    url(r'robots.txt$', robots_txt, name='robots_txt'),
 
     url(r'^(?:|projects/(?P<project_slug>{project_slug})/)'
         r'page/(?P<filename>.*)$'.format(**pattern_opts),

From 65c89042cab8de6f4ad2ac6a5c1fbd3c649e6e65 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 11:15:27 +0100
Subject: [PATCH 03/10] Explicit Allow/Disallow instead of 404

---
 readthedocs/core/views/serve.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index 9a0aff804e4..828b6bb9a82 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -234,8 +234,8 @@ def robots_txt(request, project):
     we serve it directly.
     """
     if project.privacy_level == constants.PRIVATE:
-        # If project is private, there is nothing to communicate to the bots.
-        raise Http404()
+        # If project is private, we disallow the whole site
+        raise HttpResponse('User-agent: *\nDisallow: /\n')
 
     # Use the ``robots.txt`` file from the default version configured
     version_slug = project.get_default_version()
@@ -257,4 +257,4 @@ def robots_txt(request, project):
     if os.path.exists(fullpath):
         return HttpResponse(open(fullpath).read(), content_type='text/plain')
 
-    raise Http404()
+    raise HttpResponse('User-agent: *\nAllow: /\n')

From 52588a39bc3cafa60a00d264f1a0a2143e6019e1 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 11:52:58 +0100
Subject: [PATCH 04/10] FAQ documentation for robots.txt file

---
 docs/faq.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/docs/faq.rst b/docs/faq.rst
index 30025963c18..da08605d975 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -230,3 +230,49 @@ What commit of Read the Docs is in production?
 ----------------------------------------------
 
 We deploy readthedocs.org from the `rel` branch in our GitHub repository. You can see the latest commits that have been deployed by looking on GitHub: https://github.com/rtfd/readthedocs.org/commits/rel
+
+
+How can I avoid bot crawlers to show a deprecated version of my docs?
+----------------------------------------------------------------------
+
+If readers search something related to your docs in Google, it will probably return the most relevant version of your documentation.
+It may happen that this version is already deprecated and you want to stop Google indexing it as a result,
+and start suggesting the latest (or newer) one.
+
+To accomplish this, you can add a ``robots.txt`` file to your documentation's root so it ends up served at the root URL of your project
+(for example, https://yourproject.readthedocs.io/robots.txt).
+
+
+Minimal example of ``robots.txt``
++++++++++++++++++++++++++++++++++
+
+::
+
+    User-agent: *
+    Disallow: /en/deprecated-version/
+    Disallow: /en/2.0/
+
+.. note::
+
+    See `Google's docs`_ for its full syntax.
+
+This file has to be served as is under ``/robots.txt``.
+Depending if you are using Sphinx or MkDocs, you will need a different configuration for this.
+
+
+Sphinx
+~~~~~~
+
+Sphinx uses `html_extra`_ option to add static files to the output.
+You need to create a ``robots.txt`` file and put it under the path defined in ``html_extra``.
+
+
+MkDocs
+~~~~~~
+
+MkDocs needs the ``robots.txt`` to be at the directory defined at `docs_dir`_ config.
+
+
+.. _Google's docs: https://support.google.com/webmasters/answer/6062608
+.. _html_extra: https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-html_extra_path
+.. _docs_dir: https://www.mkdocs.org/user-guide/configuration/#docs_dir
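
A minimal sketch of the Sphinx side of the FAQ entry added above, assuming the
custom ``robots.txt`` is kept in an ``_extra/`` directory next to ``conf.py``
(the directory name is illustrative, not something the patch prescribes) and
using the ``html_extra_path`` option that the FAQ's ``html_extra`` link points
to::

    # conf.py
    # Everything inside _extra/ (e.g. _extra/robots.txt) is copied verbatim
    # into the root of the generated HTML, so it ends up served at /robots.txt.
    html_extra_path = ['_extra']

For MkDocs the equivalent is placing ``robots.txt`` directly inside the
directory pointed to by ``docs_dir`` (``docs/`` by default), for example
``docs/robots.txt``.
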
From 988c46bf34cad9fe0c12959e0f860ed25bd9e0cf Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 16:52:15 +0100
Subject: [PATCH 05/10] Use return for HttpResponse

---
 readthedocs/core/views/serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index 828b6bb9a82..5d852bc2d06 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -257,4 +257,4 @@ def robots_txt(request, project):
     if os.path.exists(fullpath):
         return HttpResponse(open(fullpath).read(), content_type='text/plain')
 
-    raise HttpResponse('User-agent: *\nAllow: /\n')
+    return HttpResponse('User-agent: *\nAllow: /\n')

From 5ccc878ee5e98ef683922c6d060ba2a9d453a6e2 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 16:52:35 +0100
Subject: [PATCH 06/10] Do not serve the robots.txt file if private or version not active/built

---
 readthedocs/core/views/serve.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index 5d852bc2d06..27e59978004 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -233,12 +233,23 @@ def robots_txt(request, project):
     If the user added a ``robots.txt`` in the "default version" of the project,
     we serve it directly.
     """
-    if project.privacy_level == constants.PRIVATE:
-        # If project is private, we disallow the whole site
-        raise HttpResponse('User-agent: *\nDisallow: /\n')
-
     # Use the ``robots.txt`` file from the default version configured
     version_slug = project.get_default_version()
+    version = project.versions.get(slug=version_slug)
+
+    no_serve_robots_txt = any(
+        # If project is private or,
+        project.privacy_level == constants.PRIVATE,
+        # default version is private or,
+        version.privacy_level == constants.PRIVATE,
+        # default version is not active or,
+        not version.active,
+        # default version is not built
+        not version.built,
+    )
+    if no_serve_robots_txt:
+        # ... we do return a 404
+        raise Http404()
 
     filename = resolve_path(
         project,
From 38d812080fca6727d7c80f999f1067e0e96d48d6 Mon Sep 17 00:00:00 2001
From: Eric Holscher <25510+ericholscher@users.noreply.github.com>
Date: Mon, 14 Jan 2019 16:02:45 +0100
Subject: [PATCH 07/10] Rephrase question

Co-Authored-By: humitos
---
 docs/faq.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/faq.rst b/docs/faq.rst
index da08605d975..c782632012f 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -232,7 +232,7 @@
 We deploy readthedocs.org from the `rel` branch in our GitHub repository. You can see the latest commits that have been deployed by looking on GitHub: https://github.com/rtfd/readthedocs.org/commits/rel
 
 
-How can I avoid bot crawlers to show a deprecated version of my docs?
+How can I avoid search results having a deprecated version of my docs?
 ----------------------------------------------------------------------
 
 If readers search something related to your docs in Google, it will probably return the most relevant version of your documentation.

From 47248113d7220bf2cbfa83cf3ab5854fda3d8e20 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Mon, 14 Jan 2019 17:13:35 +0100
Subject: [PATCH 08/10] Add content_type to default response

---
 readthedocs/core/views/serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index 27e59978004..4ac2c7e864e 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -268,4 +268,4 @@ def robots_txt(request, project):
     if os.path.exists(fullpath):
         return HttpResponse(open(fullpath).read(), content_type='text/plain')
 
-    return HttpResponse('User-agent: *\nAllow: /\n')
+    return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain')

From 19b52092e21de6f616ba7cd50d67688bcccb043d Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Wed, 16 Jan 2019 16:51:10 +0100
Subject: [PATCH 09/10] any() call fixed

---
 readthedocs/core/views/serve.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/readthedocs/core/views/serve.py b/readthedocs/core/views/serve.py
index 4ac2c7e864e..2b24b45d95c 100644
--- a/readthedocs/core/views/serve.py
+++ b/readthedocs/core/views/serve.py
@@ -237,7 +237,7 @@ def robots_txt(request, project):
     version_slug = project.get_default_version()
     version = project.versions.get(slug=version_slug)
 
-    no_serve_robots_txt = any(
+    no_serve_robots_txt = any([
         # If project is private or,
         project.privacy_level == constants.PRIVATE,
         # default version is private or,
@@ -246,7 +246,7 @@ def robots_txt(request, project):
         not version.active,
         # default version is not built
         not version.built,
-    )
+    ])
     if no_serve_robots_txt:
         # ... we do return a 404
         raise Http404()
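
A quick illustration (not part of the series) of why patch 09 is needed:
``any()`` accepts a single iterable, so the earlier call with several
positional arguments would raise a ``TypeError`` at request time instead of
evaluating the conditions::

    >>> any(True, False, False)
    Traceback (most recent call last):
      ...
    TypeError: any() takes exactly one argument (3 given)
    >>> any([True, False, False])
    True
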
From 3e4b1a43b00be793f4b9b755c5a9dc94e2d8a5c3 Mon Sep 17 00:00:00 2001
From: Manuel Kaufmann
Date: Wed, 16 Jan 2019 16:51:30 +0100
Subject: [PATCH 10/10] Tests for serving default and custom robots.txt file (public/private)

---
 .../rtd_tests/tests/test_doc_serving.py | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/readthedocs/rtd_tests/tests/test_doc_serving.py b/readthedocs/rtd_tests/tests/test_doc_serving.py
index 9a5176aec21..56798ad8499 100644
--- a/readthedocs/rtd_tests/tests/test_doc_serving.py
+++ b/readthedocs/rtd_tests/tests/test_doc_serving.py
@@ -2,13 +2,17 @@
 from __future__ import absolute_import, unicode_literals, division, print_function
 
 import mock
+from mock import patch, mock_open
 import django_dynamic_fixture as fixture
+import pytest
+import six
 
 from django.contrib.auth.models import User
 from django.test import TestCase
 from django.test.utils import override_settings
 from django.http import Http404
 from django.conf import settings
+from django.urls import reverse
 
 from readthedocs.rtd_tests.base import RequestFactoryTestMixin
 from readthedocs.projects import constants
@@ -77,6 +81,28 @@ def test_private_files_not_found(self):
         self.assertTrue('private_web_root' in str(exc.exception))
         self.assertTrue('public_web_root' not in str(exc.exception))
 
+    @override_settings(
+        PYTHON_MEDIA=False,
+        USE_SUBDOMAIN=True,
+        PUBLIC_DOMAIN='readthedocs.io',
+        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
+    )
+    def test_robots_txt(self):
+        self.public.versions.update(active=True, built=True)
+        response = self.client.get(
+            reverse('robots_txt'),
+            HTTP_HOST='private.readthedocs.io',
+        )
+        self.assertEqual(response.status_code, 404)
+
+        self.client.force_login(self.eric)
+        response = self.client.get(
+            reverse('robots_txt'),
+            HTTP_HOST='private.readthedocs.io',
+        )
+        # Private projects/versions always return 404 for robots.txt
+        self.assertEqual(response.status_code, 404)
+
 
 @override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC])
 class TestPublicDocs(BaseDocServing):
@@ -110,3 +136,41 @@ def test_both_files_not_found(self):
             _serve_symlink_docs(request, project=self.private, filename='/en/latest/usage.html', privacy_level='public')
         self.assertTrue('private_web_root' not in str(exc.exception))
         self.assertTrue('public_web_root' in str(exc.exception))
+
+    @override_settings(
+        PYTHON_MEDIA=False,
+        USE_SUBDOMAIN=True,
+        PUBLIC_DOMAIN='readthedocs.io',
+        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
+    )
+    def test_default_robots_txt(self):
+        self.public.versions.update(active=True, built=True)
+        response = self.client.get(
+            reverse('robots_txt'),
+            HTTP_HOST='public.readthedocs.io',
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content, b'User-agent: *\nAllow: /\n')
+
+    @override_settings(
+        PYTHON_MEDIA=False,
+        USE_SUBDOMAIN=True,
+        PUBLIC_DOMAIN='readthedocs.io',
+        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
+    )
+    @patch(
+        'builtins.open',
+        new_callable=mock_open,
+        read_data='My own robots.txt',
+    )
+    @patch('readthedocs.core.views.serve.os')
+    @pytest.mark.skipif(six.PY2, reason='In Python2 the mock is __builtins__.open')
+    def test_custom_robots_txt(self, os_mock, open_mock):
+        os_mock.path.exists.return_value = True
+        self.public.versions.update(active=True, built=True)
+        response = self.client.get(
+            reverse('robots_txt'),
+            HTTP_HOST='public.readthedocs.io',
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content, b'My own robots.txt')
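
A possible Python 2 friendly variant of the mock used in the last test,
sketched here as a suggestion rather than as part of the series: instead of
patching ``builtins.open`` (which lives in ``__builtin__`` on Python 2), the
``open`` call can be patched in the module under test, so the same target
works on both interpreters and the ``skipif`` marker becomes unnecessary (the
``@override_settings`` stack from the original test is omitted here)::

    @patch(
        'readthedocs.core.views.serve.open',
        new_callable=mock_open,
        read_data='My own robots.txt',
        create=True,  # serve.py does not define ``open`` itself
    )
    @patch('readthedocs.core.views.serve.os')
    def test_custom_robots_txt(self, os_mock, open_mock):
        os_mock.path.exists.return_value = True
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='public.readthedocs.io',
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, b'My own robots.txt')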