Skip to content

Commit cdb88ad

Browse files
committed
Proxito: handle http to https redirects for all requests
In order to be able to cache the redirects, I had to refactor some of our code (and kind of a little in the direction of #10124). Closes #10144
1 parent 6e15a2b commit cdb88ad

File tree

9 files changed

+231
-123
lines changed

9 files changed

+231
-123
lines changed

readthedocs/api/mixins.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from readthedocs.core.unresolver import UnresolverError, unresolve
88
from readthedocs.core.utils import get_cache_tag
99
from readthedocs.projects.models import Project
10+
from readthedocs.proxito.cache import add_cache_tags
1011

1112
log = structlog.get_logger(__name__)
1213

@@ -31,7 +32,7 @@ def dispatch(self, request, *args, **kwargs):
3132
response = super().dispatch(request, *args, **kwargs)
3233
cache_tags = self._get_cache_tags()
3334
if cache_tags:
34-
response["Cache-Tag"] = ",".join(cache_tags)
35+
add_cache_tags(response, cache_tags)
3536
return response
3637

3738
def _get_cache_tags(self):

readthedocs/core/mixins.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from django.contrib.auth.mixins import LoginRequiredMixin
33
from vanilla import ListView
44

5+
from readthedocs.proxito.cache import cache_response, private_response
6+
57

68
class ListViewWithForm(ListView):
79

@@ -53,11 +55,12 @@ class CDNCacheControlMixin:
5355

5456
def dispatch(self, request, *args, **kwargs):
5557
response = super().dispatch(request, *args, **kwargs)
56-
cache_response = self.can_be_cached(request)
57-
if cache_response is not None:
58-
response.headers["CDN-Cache-Control"] = (
59-
"public" if cache_response else "private"
60-
)
58+
can_be_cached = self.can_be_cached(request)
59+
if can_be_cached is not None:
60+
if can_be_cached:
61+
cache_response(response)
62+
else:
63+
private_response(response)
6164
return response
6265

6366
def can_be_cached(self, request):

readthedocs/proxito/cache.py

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import structlog
2+
3+
log = structlog.get_logger(__name__)
4+
5+
CACHE_TAG_HEADER = "Cache-Tag"
6+
CDN_CACHE_CONTROL_HEADER = "CDN-Cache-Control"
7+
8+
9+
def add_cache_tags(response, cache_tags):
10+
"""
11+
Add cache tags to the response.
12+
13+
New cache tags will be appended to the existing ones.
14+
15+
:param response: The response to add cache tags to.
16+
:param cache_tags: A list of cache tags to add to the response.
17+
"""
18+
cache_tags = cache_tags.copy()
19+
current_cache_tag = response.headers.get(CACHE_TAG_HEADER)
20+
if current_cache_tag:
21+
cache_tags.append(current_cache_tag)
22+
23+
response.headers[CACHE_TAG_HEADER] = ",".join(cache_tags)
24+
25+
26+
def cache_response(response, cache_tags=None, force=True):
27+
"""
28+
Cache the response at the CDN level.
29+
30+
We add the ``Cache-Tag`` header to the response, to be able to purge
31+
the cache by a given tag. And we set the ``CDN-Cache-Control`` header
32+
to public, to cache the response at the CDN level only. This doesn't
33+
affect caching at the browser level (``Cache-Control``).
34+
35+
See:
36+
37+
- https://developers.cloudflare.com/cache/how-to/purge-cache/#cache-tags-enterprise-only
38+
- https://developers.cloudflare.com/cache/about/cdn-cache-control.
39+
40+
:param response: The response to cache.
41+
:param cache_tags: A list of cache tags to add to the response.
42+
:param force: If ``True``, the header will be set to public even if it
43+
was already set to private.
44+
"""
45+
if cache_tags:
46+
add_cache_tags(response, cache_tags)
47+
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
48+
if not response.headers.get(CACHE_TAG_HEADER):
49+
log.warning("Caching response without cache tags.")
50+
51+
response.headers[CDN_CACHE_CONTROL_HEADER] = "public"
52+
53+
54+
def private_response(response, force=True):
55+
"""
56+
Prevent the response from being cached at the CDN level.
57+
58+
We do this by explicitly setting the ``CDN-Cache-Control`` header to private.
59+
60+
:param response: The response to mark as private.
61+
:param force: If ``True``, the header will be set to private even if it
62+
was already set to public.
63+
"""
64+
if force or CDN_CACHE_CONTROL_HEADER not in response.headers:
65+
response.headers[CDN_CACHE_CONTROL_HEADER] = "private"

readthedocs/proxito/middleware.py

+44-11
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
)
2828
from readthedocs.core.utils import get_cache_tag
2929
from readthedocs.projects.models import Feature, Project
30+
from readthedocs.proxito.cache import add_cache_tags, cache_response, private_response
31+
from readthedocs.proxito.redirects import redirect_to_https
3032

3133
log = structlog.get_logger(__name__)
3234

@@ -63,15 +65,14 @@ def add_proxito_headers(self, request, response):
6365
response['X-RTD-Path'] = path
6466

6567
# Include the project & project-version so we can do larger purges if needed
66-
cache_tag = response.get('Cache-Tag')
67-
cache_tags = [cache_tag] if cache_tag else []
68+
cache_tags = []
6869
if project_slug:
6970
cache_tags.append(project_slug)
7071
if version_slug:
7172
cache_tags.append(get_cache_tag(project_slug, version_slug))
7273

7374
if cache_tags:
74-
response['Cache-Tag'] = ','.join(cache_tags)
75+
add_cache_tags(response, cache_tags)
7576

7677
unresolved_domain = request.unresolved_domain
7778
if unresolved_domain:
@@ -167,22 +168,20 @@ def add_cache_headers(self, request, response):
167168
168169
See https://developers.cloudflare.com/cache/about/cdn-cache-control.
169170
"""
170-
cdn_cache_header = "CDN-Cache-Control"
171171
unresolved_domain = request.unresolved_domain
172172
# Never trust projects resolving from the X-RTD-Slug header,
173173
# we don't want to cache their content on domains from other
174174
# projects, see GHSA-mp38-vprc-7hf5.
175175
if unresolved_domain and unresolved_domain.is_from_http_header:
176-
response.headers[cdn_cache_header] = "private"
176+
private_response(response, force=True)
177177
# SECURITY: Return early, we never want to cache this response.
178178
return
179179

180-
# Set the key only if it hasn't already been set by the view.
181-
if cdn_cache_header not in response.headers:
182-
default_cache_level = (
183-
"private" if settings.ALLOW_PRIVATE_REPOS else "public"
184-
)
185-
response.headers[cdn_cache_header] = default_cache_level
180+
# Mark the response as private or cache it, if it hasn't been marked as so already.
181+
if settings.ALLOW_PRIVATE_REPOS:
182+
private_response(response, force=False)
183+
else:
184+
cache_response(response, force=False)
186185

187186
def _set_request_attributes(self, request, unresolved_domain):
188187
"""
@@ -233,6 +232,10 @@ def process_request(self, request): # noqa
233232

234233
self._set_request_attributes(request, unresolved_domain)
235234

235+
response = self._get_https_redirect(request)
236+
if response:
237+
return response
238+
236239
# Remove multiple slashes from URL's
237240
if '//' in request.path:
238241
url_parsed = urlparse(request.get_full_path())
@@ -286,6 +289,36 @@ def add_hosting_integrations_headers(self, request, response):
286289
if project.has_feature(Feature.HOSTING_INTEGRATIONS):
287290
response["X-RTD-Hosting-Integrations"] = "true"
288291

292+
def _get_https_redirect(self, request):
293+
"""
294+
Get a redirect response if the request should be redirected to HTTPS.
295+
296+
A request should be redirected to HTTPS if any of the following conditions are met:
297+
298+
- It's from a custom domain and the domain has HTTPS enabled.
299+
- It's from a public domain, and the public domain uses HTTPS.
300+
"""
301+
if request.is_secure():
302+
return None
303+
304+
unresolved_domain = request.unresolved_domain
305+
306+
# HTTPS redirect for custom domains.
307+
if unresolved_domain.is_from_custom_domain:
308+
domain = unresolved_domain.domain
309+
if domain.https:
310+
return redirect_to_https(request, project=unresolved_domain.project)
311+
return None
312+
313+
# HTTPS redirect for public domains.
314+
if (
315+
unresolved_domain.is_from_public_domain
316+
or unresolved_domain.is_from_external_domain
317+
) and settings.PUBLIC_DOMAIN_USES_HTTPS:
318+
return redirect_to_https(request, project=unresolved_domain.project)
319+
320+
return None
321+
289322
def process_response(self, request, response): # noqa
290323
self.add_proxito_headers(request, response)
291324
self.add_cache_headers(request, response)

readthedocs/proxito/redirects.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from urllib.parse import urlparse
2+
3+
import structlog
4+
from django.http import HttpResponseRedirect
5+
6+
from readthedocs.core.resolver import resolver
7+
from readthedocs.core.utils.url import unsafe_join_url_path
8+
from readthedocs.proxito.cache import cache_response
9+
from readthedocs.proxito.constants import RedirectType
10+
from readthedocs.redirects.exceptions import InfiniteRedirectException
11+
12+
log = structlog.get_logger(__name__)
13+
14+
15+
def redirect_to_https(request, project):
16+
"""
17+
Shortcut to get a https redirect.
18+
19+
:param request: Request object.
20+
:param project: The current project being served
21+
"""
22+
return canonical_redirect(request, project, RedirectType.http_to_https)
23+
24+
25+
def canonical_redirect(request, project, redirect_type, external_version_slug=None):
26+
"""
27+
Return a canonical redirect response.
28+
29+
All redirects are cached, since the final URL will be checked for authorization.
30+
31+
The following cases are covered:
32+
33+
- Redirect a custom domain from http to https
34+
http://project.rtd.io/ -> https://project.rtd.io/
35+
- Redirect a domain to a canonical domain (http or https).
36+
http://project.rtd.io/ -> https://docs.test.com/
37+
http://project.rtd.io/foo/bar/ -> https://docs.test.com/foo/bar/
38+
- Redirect from a subproject domain to the main domain
39+
https://subproject.rtd.io/en/latest/foo -> https://main.rtd.io/projects/subproject/en/latest/foo # noqa
40+
https://subproject.rtd.io/en/latest/foo -> https://docs.test.com/projects/subproject/en/latest/foo # noqa
41+
42+
Raises ``InfiniteRedirectException`` if the redirect is the same as the current URL.
43+
44+
:param request: Request object.
45+
:param project: The current project being served
46+
:param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
47+
:param external_version_slug: The version slug if the request is from a pull request preview.
48+
"""
49+
from_url = request.build_absolute_uri()
50+
parsed_from = urlparse(from_url)
51+
52+
if redirect_type == RedirectType.http_to_https:
53+
# We only need to change the protocol.
54+
to = parsed_from._replace(scheme="https").geturl()
55+
elif redirect_type == RedirectType.to_canonical_domain:
56+
# We need to change the domain and protocol.
57+
canonical_domain = project.get_canonical_custom_domain()
58+
protocol = "https" if canonical_domain.https else "http"
59+
to = parsed_from._replace(
60+
scheme=protocol, netloc=canonical_domain.domain
61+
).geturl()
62+
elif redirect_type == RedirectType.subproject_to_main_domain:
63+
# We need to get the subproject root in the domain of the main
64+
# project, and append the current path.
65+
project_doc_prefix = resolver.get_url_prefix(
66+
project=project,
67+
external_version_slug=external_version_slug,
68+
)
69+
parsed_doc_prefix = urlparse(project_doc_prefix)
70+
to = parsed_doc_prefix._replace(
71+
path=unsafe_join_url_path(parsed_doc_prefix.path, parsed_from.path),
72+
query=parsed_from.query,
73+
).geturl()
74+
else:
75+
raise NotImplementedError
76+
77+
if from_url == to:
78+
# check that we do have a response and avoid infinite redirect
79+
log.warning(
80+
"Infinite Redirect: FROM URL is the same than TO URL.",
81+
url=to,
82+
)
83+
raise InfiniteRedirectException()
84+
85+
log.info(
86+
"Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
87+
)
88+
resp = HttpResponseRedirect(to)
89+
resp["X-RTD-Redirect"] = redirect_type.name
90+
cache_response(resp, cache_tags=[project.slug])
91+
return resp

readthedocs/proxito/tests/test_headers.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -173,14 +173,18 @@ def test_cache_headers_robots_txt_with_private_projects_allowed(self):
173173

174174
@override_settings(ALLOW_PRIVATE_REPOS=False)
175175
def test_cache_headers_robots_txt_with_private_projects_not_allowed(self):
176-
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
176+
r = self.client.get(
177+
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
178+
)
177179
self.assertEqual(r.status_code, 200)
178180
self.assertEqual(r["CDN-Cache-Control"], "public")
179181
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")
180182

181183
@override_settings(ALLOW_PRIVATE_REPOS=True)
182184
def test_cache_headers_robots_txt_with_private_projects_allowed(self):
183-
r = self.client.get("/sitemap.xml", HTTP_HOST="project.dev.readthedocs.io")
185+
r = self.client.get(
186+
"/sitemap.xml", secure=True, HTTP_HOST="project.dev.readthedocs.io"
187+
)
184188
self.assertEqual(r.status_code, 200)
185189
self.assertEqual(r["CDN-Cache-Control"], "public")
186190
self.assertEqual(r["Cache-Tag"], "project,project:sitemap.xml")

readthedocs/proxito/tests/test_redirects.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,9 @@ def test_translation_secure_redirect(self):
229229

230230
# Specific Page Redirects
231231
def test_proper_page_on_subdomain(self):
232-
r = self.client.get('/page/test.html', HTTP_HOST='project.dev.readthedocs.io')
232+
r = self.client.get(
233+
"/page/test.html", secure=True, HTTP_HOST="project.dev.readthedocs.io"
234+
)
233235
self.assertEqual(r.status_code, 302)
234236
self.assertEqual(
235237
r['Location'],
@@ -240,43 +242,43 @@ def test_slash_redirect(self):
240242
host = 'project.dev.readthedocs.io'
241243

242244
url = '/en/latest////awesome.html'
243-
resp = self.client.get(url, HTTP_HOST=host)
245+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
244246
self.assertEqual(resp.status_code, 302)
245247
self.assertEqual(
246248
resp['Location'], '/en/latest/awesome.html',
247249
)
248250

249251
url = '/en/latest////awesome.html'
250-
resp = self.client.get(url, HTTP_HOST=host)
252+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
251253
self.assertEqual(resp.status_code, 302)
252254
self.assertEqual(
253255
resp['Location'], '/en/latest/awesome.html',
254256
)
255257

256258
url = '/en/latest////awesome///index.html'
257-
resp = self.client.get(url, HTTP_HOST=host)
259+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
258260
self.assertEqual(resp.status_code, 302)
259261
self.assertEqual(
260262
resp['Location'], '/en/latest/awesome/index.html',
261263
)
262264

263265
url = '/en/latest////awesome///index.html?foo=bar'
264-
resp = self.client.get(url, HTTP_HOST=host)
266+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
265267
self.assertEqual(resp.status_code, 302)
266268
self.assertEqual(
267269
resp['Location'], '/en/latest/awesome/index.html?foo=bar',
268270
)
269271

270272
url = '/en/latest////awesome///'
271-
resp = self.client.get(url, HTTP_HOST=host)
273+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
272274
self.assertEqual(resp.status_code, 302)
273275
self.assertEqual(
274276
resp['Location'], '/en/latest/awesome/',
275277
)
276278

277279
# Don't change the values of params
278280
url = '/en/latest////awesome///index.html?foo=bar//bas'
279-
resp = self.client.get(url, HTTP_HOST=host)
281+
resp = self.client.get(url, secure=True, HTTP_HOST=host)
280282
self.assertEqual(resp.status_code, 302)
281283
self.assertEqual(
282284
resp['Location'], '/en/latest/awesome/index.html?foo=bar//bas',

0 commit comments

Comments
 (0)