Skip to content

Proxito: handle http to https redirects for all requests #10199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 19, 2023
Merged
3 changes: 2 additions & 1 deletion readthedocs/api/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from readthedocs.core.unresolver import UnresolverError, unresolve
from readthedocs.core.utils import get_cache_tag
from readthedocs.projects.models import Project
from readthedocs.proxito.cache import add_cache_tags

log = structlog.get_logger(__name__)

Expand All @@ -31,7 +32,7 @@ def dispatch(self, request, *args, **kwargs):
response = super().dispatch(request, *args, **kwargs)
cache_tags = self._get_cache_tags()
if cache_tags:
response["Cache-Tag"] = ",".join(cache_tags)
add_cache_tags(response, cache_tags)
return response

def _get_cache_tags(self):
Expand Down
13 changes: 8 additions & 5 deletions readthedocs/core/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from django.contrib.auth.mixins import LoginRequiredMixin
from vanilla import ListView

from readthedocs.proxito.cache import cache_response, private_response


class ListViewWithForm(ListView):

Expand Down Expand Up @@ -53,11 +55,12 @@ class CDNCacheControlMixin:

def dispatch(self, request, *args, **kwargs):
response = super().dispatch(request, *args, **kwargs)
cache_response = self.can_be_cached(request)
if cache_response is not None:
response.headers["CDN-Cache-Control"] = (
"public" if cache_response else "private"
)
can_be_cached = self.can_be_cached(request)
if can_be_cached is not None:
if can_be_cached:
cache_response(response)
else:
private_response(response)
return response

def can_be_cached(self, request):
Expand Down
65 changes: 65 additions & 0 deletions readthedocs/proxito/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import structlog

log = structlog.get_logger(__name__)

CACHE_TAG_HEADER = "Cache-Tag"
CDN_CACHE_CONTROL_HEADER = "CDN-Cache-Control"
Comment on lines +5 to +6
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are using a constants.py on each Django application. Probably makes sense to keep that same pattern here as well?



def add_cache_tags(response, cache_tags):
"""
Add cache tags to the response.

New cache tags will be appended to the existing ones.

:param response: The response to add cache tags to.
:param cache_tags: A list of cache tags to add to the response.
"""
cache_tags = cache_tags.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we copy them for any particular reason?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are extending this list below, we don't want to change the original list.

current_cache_tag = response.headers.get(CACHE_TAG_HEADER)
if current_cache_tag:
cache_tags.append(current_cache_tag)

response.headers[CACHE_TAG_HEADER] = ",".join(cache_tags)


def cache_response(response, cache_tags=None, force=True):
"""
Cache the response at the CDN level.

We add the ``Cache-Tag`` header to the response, to be able to purge
the cache by a given tag. And we set the ``CDN-Cache-Control`` header
to public, to cache the response at the CDN level only. This doesn't
affect caching at the browser level (``Cache-Control``).

See:

- https://developers.cloudflare.com/cache/how-to/purge-cache/#cache-tags-enterprise-only
- https://developers.cloudflare.com/cache/about/cdn-cache-control.

:param response: The response to cache.
:param cache_tags: A list of cache tags to add to the response.
:param force: If ``True``, the header will be set to public even if it
was already set to private.
Comment on lines +42 to +43
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's not clear what force means in this context. Probably it's worth to rename this attribute to something like public= or private= depending what's the best default.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For me force is more clear than public or private, since we are forcing the request to be cached or not depending on the method being called.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point is that sometimes force=True means to force to be public and some others force=True means to be private. See https://github.com/readthedocs/readthedocs.org/pull/10199/files/c5a54a93eeb027a4187edae5c5c9610d7fbbe155#diff-47aa393923b9e640f34198f21bb776acd52797a770630d8c5a5bfd63346931c1R61

Something more explicit here would be better in my opinion. If you like the force word, probably force="public" and force="private" is a better middle ground and still pretty clear.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But they are used in different functions, the name of the function already specifies if the request will be forced to be public or private (cache_response, private_response), that's why it's just a boolean, we aren't using one function to cache the response or make it private, we are using two different functions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I saw that. IMO, it's still hard to realize what I am forcing to when passing the boolean, that's why I raised this comment here. I got confused while doing the review.

"""
if cache_tags:
add_cache_tags(response, cache_tags)
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
if not response.headers.get(CACHE_TAG_HEADER):
log.warning("Caching response without cache tags.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a real case or we are logging this here just for debugging purposes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't make sense to cache a response if we don't have a way to purge it using cache tags.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be good as a comment, or even better in the logging message.


response.headers[CDN_CACHE_CONTROL_HEADER] = "public"


def private_response(response, force=True):
"""
Prevent the response from being cached at the CDN level.

We do this by explicitly setting the ``CDN-Cache-Control`` header to private.

:param response: The response to mark as private.
:param force: If ``True``, the header will be set to private even if it
was already set to public.
Comment on lines +63 to +64
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. As a user calling private_response(response, force) is hard to know/understand/guess that the force= argument does. I'd like if can be explicit here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What other name do you suggest? To me force is clear, especially since the default is always True.

"""
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
response.headers[CDN_CACHE_CONTROL_HEADER] = "private"
55 changes: 44 additions & 11 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
)
from readthedocs.core.utils import get_cache_tag
from readthedocs.projects.models import Feature, Project
from readthedocs.proxito.cache import add_cache_tags, cache_response, private_response
from readthedocs.proxito.redirects import redirect_to_https

log = structlog.get_logger(__name__)

Expand Down Expand Up @@ -63,15 +65,14 @@ def add_proxito_headers(self, request, response):
response['X-RTD-Path'] = path

# Include the project & project-version so we can do larger purges if needed
cache_tag = response.get('Cache-Tag')
cache_tags = [cache_tag] if cache_tag else []
cache_tags = []
if project_slug:
cache_tags.append(project_slug)
if version_slug:
cache_tags.append(get_cache_tag(project_slug, version_slug))

if cache_tags:
response['Cache-Tag'] = ','.join(cache_tags)
add_cache_tags(response, cache_tags)

unresolved_domain = request.unresolved_domain
if unresolved_domain:
Expand Down Expand Up @@ -167,22 +168,20 @@ def add_cache_headers(self, request, response):

See https://developers.cloudflare.com/cache/about/cdn-cache-control.
"""
cdn_cache_header = "CDN-Cache-Control"
unresolved_domain = request.unresolved_domain
# Never trust projects resolving from the X-RTD-Slug header,
# we don't want to cache their content on domains from other
# projects, see GHSA-mp38-vprc-7hf5.
if unresolved_domain and unresolved_domain.is_from_http_header:
response.headers[cdn_cache_header] = "private"
private_response(response, force=True)
# SECURITY: Return early, we never want to cache this response.
return

# Set the key only if it hasn't already been set by the view.
if cdn_cache_header not in response.headers:
default_cache_level = (
"private" if settings.ALLOW_PRIVATE_REPOS else "public"
)
response.headers[cdn_cache_header] = default_cache_level
# Mark the response as private or cache it, if it hasn't been marked as so already.
if settings.ALLOW_PRIVATE_REPOS:
private_response(response, force=False)
else:
cache_response(response, force=False)

def _set_request_attributes(self, request, unresolved_domain):
"""
Expand Down Expand Up @@ -233,6 +232,10 @@ def process_request(self, request): # noqa

self._set_request_attributes(request, unresolved_domain)

response = self._get_https_redirect(request)
if response:
return response

# Remove multiple slashes from URL's
if '//' in request.path:
url_parsed = urlparse(request.get_full_path())
Expand Down Expand Up @@ -286,6 +289,36 @@ def add_hosting_integrations_headers(self, request, response):
if project.has_feature(Feature.HOSTING_INTEGRATIONS):
response["X-RTD-Hosting-Integrations"] = "true"

def _get_https_redirect(self, request):
"""
Get a redirect response if the request should be redirected to HTTPS.

A request should be redirected to HTTPS if any of the following conditions are met:

- It's from a custom domain and the domain has HTTPS enabled.
- It's from a public domain, and the public domain uses HTTPS.
"""
if request.is_secure():
return None

unresolved_domain = request.unresolved_domain

# HTTPS redirect for custom domains.
if unresolved_domain.is_from_custom_domain:
domain = unresolved_domain.domain
if domain.https:
return redirect_to_https(request, project=unresolved_domain.project)
return None

# HTTPS redirect for public domains.
if (
unresolved_domain.is_from_public_domain
or unresolved_domain.is_from_external_domain
) and settings.PUBLIC_DOMAIN_USES_HTTPS:
return redirect_to_https(request, project=unresolved_domain.project)

return None

def process_response(self, request, response): # noqa
self.add_proxito_headers(request, response)
self.add_cache_headers(request, response)
Expand Down
91 changes: 91 additions & 0 deletions readthedocs/proxito/redirects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from urllib.parse import urlparse

import structlog
from django.http import HttpResponseRedirect

from readthedocs.core.resolver import resolver
from readthedocs.core.utils.url import unsafe_join_url_path
from readthedocs.proxito.cache import cache_response
from readthedocs.proxito.constants import RedirectType
from readthedocs.redirects.exceptions import InfiniteRedirectException

log = structlog.get_logger(__name__)


def redirect_to_https(request, project):
"""
Shortcut to get a https redirect.

:param request: Request object.
:param project: The current project being served
"""
return canonical_redirect(request, project, RedirectType.http_to_https)


def canonical_redirect(request, project, redirect_type, external_version_slug=None):
"""
Return a canonical redirect response.

All redirects are cached, since the final URL will be checked for authorization.

The following cases are covered:

- Redirect a custom domain from http to https
http://project.rtd.io/ -> https://project.rtd.io/
- Redirect a domain to a canonical domain (http or https).
http://project.rtd.io/ -> https://docs.test.com/
http://project.rtd.io/foo/bar/ -> https://docs.test.com/foo/bar/
- Redirect from a subproject domain to the main domain
https://subproject.rtd.io/en/latest/foo -> https://main.rtd.io/projects/subproject/en/latest/foo # noqa
https://subproject.rtd.io/en/latest/foo -> https://docs.test.com/projects/subproject/en/latest/foo # noqa

Raises ``InfiniteRedirectException`` if the redirect is the same as the current URL.

:param request: Request object.
:param project: The current project being served
:param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
:param external_version_slug: The version slug if the request is from a pull request preview.
"""
from_url = request.build_absolute_uri()
parsed_from = urlparse(from_url)

if redirect_type == RedirectType.http_to_https:
# We only need to change the protocol.
to = parsed_from._replace(scheme="https").geturl()
elif redirect_type == RedirectType.to_canonical_domain:
# We need to change the domain and protocol.
canonical_domain = project.get_canonical_custom_domain()
protocol = "https" if canonical_domain.https else "http"
to = parsed_from._replace(
scheme=protocol, netloc=canonical_domain.domain
).geturl()
elif redirect_type == RedirectType.subproject_to_main_domain:
# We need to get the subproject root in the domain of the main
# project, and append the current path.
project_doc_prefix = resolver.get_url_prefix(
project=project,
external_version_slug=external_version_slug,
)
parsed_doc_prefix = urlparse(project_doc_prefix)
to = parsed_doc_prefix._replace(
path=unsafe_join_url_path(parsed_doc_prefix.path, parsed_from.path),
query=parsed_from.query,
).geturl()
else:
raise NotImplementedError

if from_url == to:
# check that we do have a response and avoid infinite redirect
log.warning(
"Infinite Redirect: FROM URL is the same than TO URL.",
url=to,
)
raise InfiniteRedirectException()

log.info(
"Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
)
resp = HttpResponseRedirect(to)
resp["X-RTD-Redirect"] = redirect_type.name
cache_response(resp, cache_tags=[project.slug])
return resp
8 changes: 6 additions & 2 deletions readthedocs/proxito/tests/test_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,18 @@ def test_cache_headers_robots_txt_with_private_projects_allowed(self):

@override_settings(ALLOW_PRIVATE_REPOS=False)
def test_cache_headers_robots_txt_with_private_projects_not_allowed(self):
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
r = self.client.get(
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
)
self.assertEqual(r.status_code, 200)
self.assertEqual(r["CDN-Cache-Control"], "public")
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")

@override_settings(ALLOW_PRIVATE_REPOS=True)
def test_cache_headers_robots_txt_with_private_projects_allowed(self):
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
r = self.client.get(
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
)
self.assertEqual(r.status_code, 200)
self.assertEqual(r["CDN-Cache-Control"], "public")
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")
Expand Down
16 changes: 9 additions & 7 deletions readthedocs/proxito/tests/test_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,9 @@ def test_translation_secure_redirect(self):

# Specific Page Redirects
def test_proper_page_on_subdomain(self):
r = self.client.get('/page/test.html', HTTP_HOST='project.dev.readthedocs.io')
r = self.client.get(
"/page/test.html", secure=True, HTTP_HOST="project.dev.readthedocs.io"
)
self.assertEqual(r.status_code, 302)
self.assertEqual(
r['Location'],
Expand All @@ -240,43 +242,43 @@ def test_slash_redirect(self):
host = 'project.dev.readthedocs.io'

url = '/en/latest////awesome.html'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome.html',
)

url = '/en/latest////awesome.html'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome.html',
)

url = '/en/latest////awesome///index.html'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome/index.html',
)

url = '/en/latest////awesome///index.html?foo=bar'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome/index.html?foo=bar',
)

url = '/en/latest////awesome///'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome/',
)

# Don't change the values of params
url = '/en/latest////awesome///index.html?foo=bar//bas'
resp = self.client.get(url, HTTP_HOST=host)
resp = self.client.get(url, secure=True, HTTP_HOST=host)
self.assertEqual(resp.status_code, 302)
self.assertEqual(
resp['Location'], '/en/latest/awesome/index.html?foo=bar//bas',
Expand Down
Loading