Skip to content

Commit 121fa70

Browse files
authored
Proxito: handle http to https redirects for all requests (#10199)
In order to be able to cache the redirects, I had to refactor some of our code (and kind of a little in the direction of #10124). Closes #10144
1 parent d4d00af commit 121fa70

File tree

11 files changed

+336
-139
lines changed

11 files changed

+336
-139
lines changed

readthedocs/api/mixins.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from readthedocs.core.unresolver import UnresolverError, unresolve
88
from readthedocs.core.utils import get_cache_tag
99
from readthedocs.projects.models import Project
10+
from readthedocs.proxito.cache import add_cache_tags
1011

1112
log = structlog.get_logger(__name__)
1213

@@ -31,7 +32,7 @@ def dispatch(self, request, *args, **kwargs):
3132
response = super().dispatch(request, *args, **kwargs)
3233
cache_tags = self._get_cache_tags()
3334
if cache_tags:
34-
response["Cache-Tag"] = ",".join(cache_tags)
35+
add_cache_tags(response, cache_tags)
3536
return response
3637

3738
def _get_cache_tags(self):

readthedocs/core/mixins.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from django.contrib.auth.mixins import LoginRequiredMixin
33
from vanilla import ListView
44

5+
from readthedocs.proxito.cache import cache_response, private_response
6+
57

68
class ListViewWithForm(ListView):
79

@@ -53,11 +55,12 @@ class CDNCacheControlMixin:
5355

5456
def dispatch(self, request, *args, **kwargs):
5557
response = super().dispatch(request, *args, **kwargs)
56-
cache_response = self.can_be_cached(request)
57-
if cache_response is not None:
58-
response.headers["CDN-Cache-Control"] = (
59-
"public" if cache_response else "private"
60-
)
58+
can_be_cached = self.can_be_cached(request)
59+
if can_be_cached is not None:
60+
if can_be_cached:
61+
cache_response(response)
62+
else:
63+
private_response(response)
6164
return response
6265

6366
def can_be_cached(self, request):

readthedocs/proxito/cache.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import structlog
2+
3+
log = structlog.get_logger(__name__)
4+
5+
CACHE_TAG_HEADER = "Cache-Tag"
6+
CDN_CACHE_CONTROL_HEADER = "CDN-Cache-Control"
7+
8+
9+
def add_cache_tags(response, cache_tags):
10+
"""
11+
Add cache tags to the response.
12+
13+
New cache tags will be appended to the existing ones.
14+
15+
:param response: The response to add cache tags to.
16+
:param cache_tags: A list of cache tags to add to the response.
17+
"""
18+
cache_tags = cache_tags.copy()
19+
current_cache_tag = response.headers.get(CACHE_TAG_HEADER)
20+
if current_cache_tag:
21+
cache_tags.append(current_cache_tag)
22+
23+
response.headers[CACHE_TAG_HEADER] = ",".join(cache_tags)
24+
25+
26+
def cache_response(response, cache_tags=None, force=True):
27+
"""
28+
Cache the response at the CDN level.
29+
30+
We add the ``Cache-Tag`` header to the response, to be able to purge
31+
the cache by a given tag. And we set the ``CDN-Cache-Control: public`` header
32+
to cache the response at the CDN level only. This doesn't
33+
affect caching at the browser level (``Cache-Control``).
34+
35+
See:
36+
37+
- https://developers.cloudflare.com/cache/how-to/purge-cache/#cache-tags-enterprise-only
38+
- https://developers.cloudflare.com/cache/about/cdn-cache-control.
39+
40+
:param response: The response to cache.
41+
:param cache_tags: A list of cache tags to add to the response.
42+
:param force: If ``True``, the header will be set to public even if it
43+
was already set to private.
44+
"""
45+
if cache_tags:
46+
add_cache_tags(response, cache_tags)
47+
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
48+
if not response.headers.get(CACHE_TAG_HEADER):
49+
# Caching a response at the CDN level without cache tags is
50+
# incorrect, since we won't be able to purge it otherwise.
51+
log.warning("Caching response without cache tags.")
52+
53+
response.headers[CDN_CACHE_CONTROL_HEADER] = "public"
54+
55+
56+
def private_response(response, force=True):
57+
"""
58+
Prevent the response from being cached at the CDN level.
59+
60+
We do this by explicitly setting the ``CDN-Cache-Control`` header to private.
61+
62+
:param response: The response to mark as private.
63+
:param force: If ``True``, the header will be set to private even if it
64+
was already set to public.
65+
"""
66+
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
67+
response.headers[CDN_CACHE_CONTROL_HEADER] = "private"

readthedocs/proxito/middleware.py

+48-12
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
)
2828
from readthedocs.core.utils import get_cache_tag
2929
from readthedocs.projects.models import Feature, Project
30+
from readthedocs.proxito.cache import add_cache_tags, cache_response, private_response
31+
from readthedocs.proxito.redirects import redirect_to_https
3032

3133
log = structlog.get_logger(__name__)
3234

@@ -63,15 +65,14 @@ def add_proxito_headers(self, request, response):
6365
response['X-RTD-Path'] = path
6466

6567
# Include the project & project-version so we can do larger purges if needed
66-
cache_tag = response.get('Cache-Tag')
67-
cache_tags = [cache_tag] if cache_tag else []
68+
cache_tags = []
6869
if project_slug:
6970
cache_tags.append(project_slug)
7071
if version_slug:
7172
cache_tags.append(get_cache_tag(project_slug, version_slug))
7273

7374
if cache_tags:
74-
response['Cache-Tag'] = ','.join(cache_tags)
75+
add_cache_tags(response, cache_tags)
7576

7677
unresolved_domain = request.unresolved_domain
7778
if unresolved_domain:
@@ -167,22 +168,20 @@ def add_cache_headers(self, request, response):
167168
168169
See https://developers.cloudflare.com/cache/about/cdn-cache-control.
169170
"""
170-
cdn_cache_header = "CDN-Cache-Control"
171171
unresolved_domain = request.unresolved_domain
172172
# Never trust projects resolving from the X-RTD-Slug header,
173173
# we don't want to cache their content on domains from other
174174
# projects, see GHSA-mp38-vprc-7hf5.
175175
if unresolved_domain and unresolved_domain.is_from_http_header:
176-
response.headers[cdn_cache_header] = "private"
176+
private_response(response, force=True)
177177
# SECURITY: Return early, we never want to cache this response.
178178
return
179179

180-
# Set the key only if it hasn't already been set by the view.
181-
if cdn_cache_header not in response.headers:
182-
default_cache_level = (
183-
"private" if settings.ALLOW_PRIVATE_REPOS else "public"
184-
)
185-
response.headers[cdn_cache_header] = default_cache_level
180+
# Mark the response as private or cache it, if it hasn't been marked as so already.
181+
if settings.ALLOW_PRIVATE_REPOS:
182+
private_response(response, force=False)
183+
else:
184+
cache_response(response, force=False)
186185

187186
def _set_request_attributes(self, request, unresolved_domain):
188187
"""
@@ -233,6 +232,10 @@ def process_request(self, request): # noqa
233232

234233
self._set_request_attributes(request, unresolved_domain)
235234

235+
response = self._get_https_redirect(request)
236+
if response:
237+
return response
238+
236239
# Remove multiple slashes from URL's
237240
if '//' in request.path:
238241
url_parsed = urlparse(request.get_full_path())
@@ -250,7 +253,9 @@ def process_request(self, request): # noqa
250253
from_url=request.get_full_path(),
251254
to_url=final_url,
252255
)
253-
return redirect(final_url)
256+
response = redirect(final_url)
257+
cache_response(response, cache_tags=[unresolved_domain.project.slug])
258+
return response
254259

255260
project = unresolved_domain.project
256261
log.debug(
@@ -286,6 +291,37 @@ def add_hosting_integrations_headers(self, request, response):
286291
if project.has_feature(Feature.HOSTING_INTEGRATIONS):
287292
response["X-RTD-Hosting-Integrations"] = "true"
288293

294+
def _get_https_redirect(self, request):
295+
"""
296+
Get a redirect response if the request should be redirected to HTTPS.
297+
298+
A request should be redirected to HTTPS if any of the following conditions are met:
299+
300+
- It's from a custom domain and the domain has HTTPS enabled.
301+
- It's from a public domain, and the public domain uses HTTPS.
302+
"""
303+
if request.is_secure():
304+
# The request is already HTTPS, so we skip redirecting it.
305+
return None
306+
307+
unresolved_domain = request.unresolved_domain
308+
309+
# HTTPS redirect for custom domains.
310+
if unresolved_domain.is_from_custom_domain:
311+
domain = unresolved_domain.domain
312+
if domain.https:
313+
return redirect_to_https(request, project=unresolved_domain.project)
314+
return None
315+
316+
# HTTPS redirect for public domains.
317+
if (
318+
unresolved_domain.is_from_public_domain
319+
or unresolved_domain.is_from_external_domain
320+
) and settings.PUBLIC_DOMAIN_USES_HTTPS:
321+
return redirect_to_https(request, project=unresolved_domain.project)
322+
323+
return None
324+
289325
def process_response(self, request, response): # noqa
290326
self.add_proxito_headers(request, response)
291327
self.add_cache_headers(request, response)

readthedocs/proxito/redirects.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from urllib.parse import urlparse
2+
3+
import structlog
4+
from django.http import HttpResponseRedirect
5+
6+
from readthedocs.core.resolver import resolver
7+
from readthedocs.core.utils.url import unsafe_join_url_path
8+
from readthedocs.proxito.cache import cache_response
9+
from readthedocs.proxito.constants import RedirectType
10+
from readthedocs.redirects.exceptions import InfiniteRedirectException
11+
12+
log = structlog.get_logger(__name__)
13+
14+
15+
def redirect_to_https(request, project):
16+
"""
17+
Shortcut to get a https redirect.
18+
19+
:param request: Request object.
20+
:param project: The current project being served
21+
"""
22+
return canonical_redirect(request, project, RedirectType.http_to_https)
23+
24+
25+
def canonical_redirect(request, project, redirect_type, external_version_slug=None):
26+
"""
27+
Return a canonical redirect response.
28+
29+
All redirects are cached, since the final URL will be checked for authorization.
30+
31+
The following cases are covered:
32+
33+
- Redirect a custom domain from http to https
34+
http://project.rtd.io/ -> https://project.rtd.io/
35+
- Redirect a domain to a canonical domain (http or https).
36+
http://project.rtd.io/ -> https://docs.test.com/
37+
http://project.rtd.io/foo/bar/ -> https://docs.test.com/foo/bar/
38+
- Redirect from a subproject domain to the main domain
39+
https://subproject.rtd.io/en/latest/foo -> https://main.rtd.io/projects/subproject/en/latest/foo # noqa
40+
https://subproject.rtd.io/en/latest/foo -> https://docs.test.com/projects/subproject/en/latest/foo # noqa
41+
42+
Raises ``InfiniteRedirectException`` if the redirect is the same as the current URL.
43+
44+
:param request: Request object.
45+
:param project: The current project being served
46+
:param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
47+
:param external_version_slug: The version slug if the request is from a pull request preview.
48+
"""
49+
from_url = request.build_absolute_uri()
50+
parsed_from = urlparse(from_url)
51+
52+
if redirect_type == RedirectType.http_to_https:
53+
# We only need to change the protocol.
54+
to = parsed_from._replace(scheme="https").geturl()
55+
elif redirect_type == RedirectType.to_canonical_domain:
56+
# We need to change the domain and protocol.
57+
canonical_domain = project.get_canonical_custom_domain()
58+
protocol = "https" if canonical_domain.https else "http"
59+
to = parsed_from._replace(
60+
scheme=protocol, netloc=canonical_domain.domain
61+
).geturl()
62+
elif redirect_type == RedirectType.subproject_to_main_domain:
63+
# We need to get the subproject root in the domain of the main
64+
# project, and append the current path.
65+
project_doc_prefix = resolver.get_url_prefix(
66+
project=project,
67+
external_version_slug=external_version_slug,
68+
)
69+
parsed_doc_prefix = urlparse(project_doc_prefix)
70+
to = parsed_doc_prefix._replace(
71+
path=unsafe_join_url_path(parsed_doc_prefix.path, parsed_from.path),
72+
query=parsed_from.query,
73+
).geturl()
74+
else:
75+
raise NotImplementedError
76+
77+
if from_url == to:
78+
# check that we do have a response and avoid infinite redirect
79+
log.warning(
80+
"Infinite Redirect: FROM URL is the same than TO URL.",
81+
url=to,
82+
)
83+
raise InfiniteRedirectException()
84+
85+
log.info(
86+
"Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
87+
)
88+
resp = HttpResponseRedirect(to)
89+
resp["X-RTD-Redirect"] = redirect_type.name
90+
cache_response(resp, cache_tags=[project.slug])
91+
return resp

readthedocs/proxito/tests/test_full.py

+8-12
Original file line numberDiff line numberDiff line change
@@ -1572,14 +1572,12 @@ def _test_cache_control_header_project(self, expected_value, host=None):
15721572
resp.headers["Cache-Tag"], "project,project:rtd-staticfiles,rtd-staticfiles"
15731573
)
15741574

1575-
# Slash redirect is done at the middleware level.
1576-
# So, it doesn't take into consideration the privacy level of the
1577-
# version, and always defaults to private.
1575+
# Slash redirects can always be cached.
15781576
url = '/en//latest//'
15791577
resp = self.client.get(url, secure=True, HTTP_HOST=host)
1580-
self.assertEqual(resp['Location'], '/en/latest/', url)
1581-
self.assertEqual(resp.headers['CDN-Cache-Control'], 'private', url)
1582-
self.assertNotIn('Cache-Tag', resp.headers, url)
1578+
self.assertEqual(resp["Location"], "/en/latest/", url)
1579+
self.assertEqual(resp.headers["CDN-Cache-Control"], "public", url)
1580+
self.assertEqual(resp.headers["Cache-Tag"], "project")
15831581

15841582
# Forced redirects will be cached only if the version is public.
15851583
get(
@@ -1637,14 +1635,12 @@ def _test_cache_control_header_subproject(self, expected_value, host=None):
16371635
resp.headers["Cache-Tag"], "project,project:rtd-staticfiles,rtd-staticfiles"
16381636
)
16391637

1640-
# Slash redirect is done at the middleware level.
1641-
# So, it doesn't take into consideration the privacy level of the
1642-
# version, and always defaults to private.
1638+
# Slash redirects can always be cached.
16431639
url = '/projects//subproject//'
16441640
resp = self.client.get(url, secure=True, HTTP_HOST=host)
1645-
self.assertEqual(resp['Location'], '/projects/subproject/', url)
1646-
self.assertEqual(resp.headers['CDN-Cache-Control'], 'private', url)
1647-
self.assertNotIn('Cache-Tag', resp.headers, url)
1641+
self.assertEqual(resp["Location"], "/projects/subproject/", url)
1642+
self.assertEqual(resp.headers["CDN-Cache-Control"], "public", url)
1643+
self.assertEqual(resp.headers["Cache-Tag"], "project")
16481644

16491645
def test_cache_on_private_versions(self):
16501646
self.project.versions.update(privacy_level=PRIVATE)

readthedocs/proxito/tests/test_headers.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -175,14 +175,18 @@ def test_cache_headers_robots_txt_with_private_projects_allowed(self):
175175

176176
@override_settings(ALLOW_PRIVATE_REPOS=False)
177177
def test_cache_headers_robots_txt_with_private_projects_not_allowed(self):
178-
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
178+
r = self.client.get(
179+
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
180+
)
179181
self.assertEqual(r.status_code, 200)
180182
self.assertEqual(r["CDN-Cache-Control"], "public")
181183
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")
182184

183185
@override_settings(ALLOW_PRIVATE_REPOS=True)
184186
def test_cache_headers_robots_txt_with_private_projects_allowed(self):
185-
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
187+
r = self.client.get(
188+
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
189+
)
186190
self.assertEqual(r.status_code, 200)
187191
self.assertEqual(r["CDN-Cache-Control"], "public")
188192
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")

readthedocs/proxito/tests/test_middleware.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ def run_middleware(self, request):
4545
def test_proper_cname(self):
4646
domain = 'docs.random.com'
4747
get(Domain, project=self.pip, domain=domain)
48-
request = self.request(method='get', path=self.url, HTTP_HOST=domain)
48+
request = self.request(
49+
method="get", secure=True, path=self.url, HTTP_HOST=domain
50+
)
4951
res = self.run_middleware(request)
5052
self.assertIsNone(res)
5153
self.assertTrue(request.unresolved_domain.is_from_custom_domain)

0 commit comments

Comments
 (0)