Skip to content

Unresolver: strict validation for external versions and other fixes #9534

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Sep 26, 2022
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 50 additions & 20 deletions readthedocs/core/unresolver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from dataclasses import dataclass
from urllib.parse import urlparse
from urllib.parse import ParseResult, urlparse

import structlog
from django.conf import settings
Expand All @@ -27,8 +27,7 @@ class UnresolvedURL:

version: Version = None
filename: str = None
query: str = None
fragment: str = None
parsed_url: ParseResult = None
domain: Domain = None
external: bool = False

Expand Down Expand Up @@ -69,7 +68,9 @@ def unresolve(self, url, add_index=True):
"""
parsed = urlparse(url)
domain = self.get_domain_from_host(parsed.netloc)
project_slug, domain_object, external = self.unresolve_domain(domain)
project_slug, domain_object, external_version_slug = self.unresolve_domain(
domain
)
if not project_slug:
return None

Expand All @@ -82,6 +83,26 @@ def unresolve(self, url, add_index=True):
path=parsed.path,
)

# Make sure we are serving the external version from the subdomain.
if external_version_slug and version:
if external_version_slug != version.slug:
log.warning(
"Invalid version for external domain.",
domain=domain,
version_slug=version.slug,
)
version = None
filename = None
Comment on lines +105 to +106
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do something more than set these to None here? Is raising an exception better, rather than having the caller need to know what None means on these fields?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting version to None will be interpreted as like the version wasn't found, but we could return None, or raise an exception, since those URLs will be generated by manually changing the URL.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this as well, and I'm not sure what's my position here.

I'd like to raise exceptions for all these cases were we return None for not found things. However, I'm not sure it's possible for all the cases because IIRC there are other places were project, None, None is useful to continue moving forward.

If that's not the case and I'm confused here, I'm 👍🏼 on standardizing this and always raise a detailed exception for each case that we can clearly handle from outside.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, basically this would be interpreted as a 404, like if the version wouldn't exist.

elif not version.is_external:
log.warning(
"Version is not external.",
domain=domain,
version_slug=version.slug,
version_type=version.type,
)
version = None
filename = None

if add_index and filename and filename.endswith("/"):
filename += "index.html"

Expand All @@ -90,10 +111,9 @@ def unresolve(self, url, add_index=True):
project=project or parent_project,
version=version,
filename=filename,
query=parsed.query,
fragment=parsed.fragment,
parsed_url=parsed,
domain=domain_object,
external=external,
external=bool(external_version_slug),
)

@staticmethod
Expand All @@ -109,7 +129,7 @@ def _match_multiversion_project(self, parent_project, path):
Try to match a multiversion project.

If the translation exists, we return a result even if the version doesn't,
so the translation is taken as the canonical project (useful for 404 pages).
so the translation is taken as the current project (useful for 404 pages).
"""
match = self.multiversion_pattern.match(path)
if not match:
Expand Down Expand Up @@ -138,6 +158,9 @@ def _match_subproject(self, parent_project, path):

If the subproject exists, we try to resolve the rest of the path
with the subproject as the canonical project.

If the subproject exists, we return a result even if version doesn't,
so the subproject is taken as the current project (useful for 404 pages).
"""
match = self.subproject_pattern.match(path)
if not match:
Expand All @@ -151,11 +174,18 @@ def _match_subproject(self, parent_project, path):
.first()
)
if project_relationship:
return self._unresolve_path(
parent_project=project_relationship.child,
subproject = project_relationship.child
response = self._unresolve_path(
parent_project=subproject,
path=file,
check_subprojects=False,
)
# If we got a valid response, return that,
# otherwise return the current subproject
# as the current project without a valid version or path.
if response:
return response
return subproject, None, None
return None

def _match_single_version_project(self, parent_project, path):
Expand Down Expand Up @@ -216,7 +246,7 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True):
if response:
return response

return None, None, None
return parent_project, None, None

@staticmethod
def get_domain_from_host(host):
Expand All @@ -234,8 +264,8 @@ def unresolve_domain(self, domain):
Unresolve domain by extracting relevant information from it.

:param str domain: Domain to extract the information from.
:returns: A tuple with: the project slug, domain object, and if the domain
is from an external version.
:returns: A tuple with: the project slug, domain object, and the
external version slug if the domain is from an external version.
"""
public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN)
external_domain = self.get_domain_from_host(
Expand All @@ -249,23 +279,23 @@ def unresolve_domain(self, domain):
# Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`.
if public_domain == root_domain:
project_slug = subdomain
log.debug("Public domain.", domain=domain)
return project_slug, None, False
log.info("Public domain.", domain=domain)
return project_slug, None, None

# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com)
# for example, but these might be phishing, so let's ignore them for now.
log.warning("Weird variation of our domain.", domain=domain)
return None, None, False
return None, None, None

# Serve PR builds on external_domain host.
if external_domain == root_domain:
try:
log.debug("External versions domain.", domain=domain)
project_slug, _ = subdomain.rsplit("--", maxsplit=1)
return project_slug, None, True
project_slug, version_slug = subdomain.rsplit("--", maxsplit=1)
log.info("External versions domain.", domain=domain)
return project_slug, None, version_slug
except ValueError:
log.info("Invalid format of external versions domain.", domain=domain)
return None, None, False
return None, None, None

# Custom domain.
domain_object = (
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/embed/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get(self, request):
if url:
unresolved = self.unresolved_url
path = unresolved.filename
section = unresolved.fragment
section = unresolved.parsed_url.fragment
elif not path and not doc:
return Response(
{
Expand Down
10 changes: 5 additions & 5 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug)
return project_slug

project_slug, domain_object, external = unresolver.unresolve_domain(host)
project_slug, domain_object, external_version_slug = unresolver.unresolve_domain(
host
)
if not project_slug:
# Block domains that look like ours, may be phishing.
if external_domain in host or public_domain in host:
Expand Down Expand Up @@ -85,11 +87,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
return project_slug

# Pull request previews.
if external:
subdomain, _ = host.split(".", maxsplit=1)
_, version_slug = subdomain.rsplit("--", maxsplit=1)
if external_version_slug:
request.external_domain = True
request.host_version_slug = version_slug
request.host_version_slug = external_version_slug
log.debug("Proxito External Version Domain.", host=host)
return project_slug

Expand Down
36 changes: 34 additions & 2 deletions readthedocs/rtd_tests/tests/test_unresolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def test_unresolver(self):
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, self.version)
self.assertEqual(parts.filename, "/foo.html")
self.assertEqual(parts.fragment, "fragment")
self.assertEqual(parts.query, "search=api")
self.assertEqual(parts.parsed_url.fragment, "fragment")
self.assertEqual(parts.parsed_url.query, "search=api")

def test_filename_wihtout_index(self):
parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False)
Expand Down Expand Up @@ -108,6 +108,20 @@ def test_unresolve_subproject_alias(self):
self.assertEqual(parts.version, self.subproject_version)
self.assertEqual(parts.filename, "/index.html")

def test_unresolve_subproject_invalid_version(self):
parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/nothing/foo.html")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.subproject)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_unresolve_subproject_invalid_translation(self):
parts = unresolve("https://pip.readthedocs.io/projects/sub/es/latest/foo.html")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.subproject)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_unresolver_translation(self):
parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html")
self.assertEqual(parts.parent_project, self.pip)
Expand Down Expand Up @@ -196,6 +210,24 @@ def test_unresolve_external_version_no_existing_version(self):
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_external_version_not_matching_final_version(self):
parts = unresolve("https://pip--10.dev.readthedocs.build/en/latest/")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_normal_version_in_external_version_subdomain(self):
parts = unresolve("https://pip--latest.dev.readthedocs.build/en/latest/")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_malformed_external_version(self):
parts = unresolve("https://pip-latest.dev.readthedocs.build/en/latest/")
self.assertEqual(parts, None)

def test_unresolver_unknown_host(self):
parts = unresolve("https://random.stuff.com/en/latest/")
self.assertEqual(parts, None)