Skip to content

New unresolver implementation #9500

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Aug 23, 2022
Merged
9 changes: 4 additions & 5 deletions readthedocs/analytics/proxied_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from readthedocs.analytics.models import PageView
from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion
from readthedocs.core.unresolver import unresolve_from_request
from readthedocs.core.unresolver import unresolve
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.projects.models import Project

Expand Down Expand Up @@ -50,18 +50,17 @@ def get(self, request, *args, **kwargs):
version = self._get_version()
absolute_uri = self.request.GET.get('absolute_uri')
self.increase_page_view_count(
request=request,
project=project,
version=version,
absolute_uri=absolute_uri,
)
return Response(status=200)

# pylint: disable=no-self-use
def increase_page_view_count(self, request, project, version, absolute_uri):
def increase_page_view_count(self, project, version, absolute_uri):
"""Increase the page view count for the given project."""
unresolved = unresolve_from_request(request, absolute_uri)
if not unresolved:
unresolved = unresolve(absolute_uri)
if not unresolved or not unresolved.filename:
return

path = unresolved.filename
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/api/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def _get_version(self):
return

if self.unresolved_url:
version_slug = self.unresolved_url.version_slug
version_slug = self.unresolved_url.version.slug
else:
version_slug = self.request.GET.get("version", "latest")
project = self._get_project()
Expand Down
300 changes: 240 additions & 60 deletions readthedocs/core/unresolver.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,279 @@
import structlog
from collections import namedtuple
import re
from dataclasses import dataclass
from urllib.parse import urlparse

from django.test.client import RequestFactory
from django.urls import resolve as url_resolve
import structlog
from django.conf import settings

from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.proxito.middleware import map_host_to_project_slug
from readthedocs.proxito.views.mixins import ServeDocsMixin
from readthedocs.proxito.views.utils import _get_project_data_from_request
from readthedocs.builds.models import Version
from readthedocs.constants import pattern_opts
from readthedocs.projects.models import Domain, Project

log = structlog.get_logger(__name__)

UnresolvedObject = namedtuple(
'Unresolved', 'project, language_slug, version_slug, filename, fragment')

@dataclass(slots=True)
class UnresolvedURL:

"""Dataclass with the parts of an unresolved URL."""

# This is the project that owns the domain,
# this usually is the parent project of a translation or subproject.
canonical_project: Project

# The current project we are serving the docs from.
# It can be the same as canonical_project.
project: Project

version: Version = None
filename: str = None
query: str = None
fragment: str = None
domain: Domain = None
external: bool = False


class UnresolverBase:
class Unresolver:
# This pattern matches:
# - /en
# - /en/
# - /en/latest
# - /en/latest/
# - /en/latest/file/name/
multiversion_pattern = re.compile(
r"^/(?P<language>{lang_slug})(/((?P<version>{version_slug})(/(?P<file>{filename_slug}))?)?)?$".format( # noqa
**pattern_opts
)
)

# This pattern matches:
# - /projects/subproject
# - /projects/subproject/
# - /projects/subproject/file/name/
subproject_pattern = re.compile(
r"^/projects/(?P<project>{project_slug}+)(/(?P<file>{filename_slug}))?$".format(
**pattern_opts
)
)

def unresolve(self, url):
def unresolve(self, url, add_index=True):
"""
Turn a URL into the component parts that our views would use to process them.

This is useful for lots of places,
like where we want to figure out exactly what file a URL maps to.

:param url: Full URL to unresolve (including the protocol and domain part).
:param add_index: If `True` the filename will be normalized
to end with ``/index.html``.
"""
parsed = urlparse(url)
domain = parsed.netloc.split(':', 1)[0]

# TODO: Make this not depend on the request object,
# but instead move all this logic here working on strings.
request = RequestFactory().get(path=parsed.path, HTTP_HOST=domain)
project_slug = map_host_to_project_slug(request)
domain = self.get_domain_from_host(parsed.netloc)
project_slug, domain_object, external = self.unresolve_domain(domain)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are using project_slug to refer to the parent_project_slug here. Below, we have parent_project and project too.

I think it would be good to refactor this to use more descriptive names:

Probably makes sense to change the UnresolvedURL to use current_project as well so we keep consistency all across our code.

if not project_slug:
return None

# Handle returning a response
if hasattr(project_slug, 'status_code'):
canonical_project = Project.objects.filter(slug=project_slug).first()
if not canonical_project:
return None

request.host_project_slug = request.slug = project_slug
return self.unresolve_from_request(request, url)
project, version, filename = self._unresolve_path(
canonical_project=canonical_project,
path=parsed.path,
)

if add_index and filename and filename.endswith("/"):
filename += "index.html"

def unresolve_from_request(self, request, path):
return UnresolvedURL(
canonical_project=canonical_project,
project=project or canonical_project,
version=version,
filename=filename,
query=parsed.query,
fragment=parsed.fragment,
domain=domain_object,
external=external,
)

@staticmethod
def _normalize_filename(filename):
"""Normalize filename to always start with ``/``."""
filename = filename or "/"
if not filename.startswith("/"):
filename = "/" + filename
return filename

def _match_multiversion_project(self, canonical_project, path):
"""
Unresolve using a request.
Try to match a multiversion project.

``path`` can be a full URL, but the domain will be ignored,
since that information is already in the request object.
If the translation exists, we return a result even if the version doesn't,
so the translation is taken as the canonical project (useful for 404 pages).
"""
match = self.multiversion_pattern.match(path)
if not match:
return None

language = match.group("language")
version_slug = match.group("version")
file = self._normalize_filename(match.group("file"))

if canonical_project.language == language:
project = canonical_project
else:
project = canonical_project.translations.filter(language=language).first()

None is returned if the request isn't valid.
if project:
version = project.versions.filter(slug=version_slug).first()
if version:
return project, version, file
return project, None, None

return None

def _match_subproject(self, canonical_project, path):
"""
parsed = urlparse(path)
path = parsed.path
project_slug = getattr(request, 'host_project_slug', None)
Try to match a subproject.

if not project_slug:
If the subproject exists, we try to resolve the rest of the path
with the subproject as the canonical project.
"""
match = self.subproject_pattern.match(path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
match = self.subproject_pattern.match(path)
# TODO: Allow the canonical_project to override this url somehow..
# if canonical_project.subproject_pattern:
# match = canonical_project.subproject_pattern.match(path)
match = self.subproject_pattern.match(path)

Is this the longer term goal for customization?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, basically on each call to match, will be some extra logic to use the custom pattern from the project instead of the default one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same idea should be used for multiversion patterns 👍🏼

if not match:
return None

_, __, kwargs = url_resolve(
path,
urlconf='readthedocs.proxito.urls',
project_slug = match.group("project")
file = self._normalize_filename(match.group("file"))
project_relationship = (
canonical_project.subprojects.filter(alias=project_slug)
.prefetch_related("child")
.first()
)
if project_relationship:
return self._unresolve_path(
canonical_project=project_relationship.child,
path=file,
check_subprojects=False,
)
return None

mixin = ServeDocsMixin()
version_slug = mixin.get_version_from_host(request, kwargs.get('version_slug'))
def _match_single_version_project(self, canonical_project, path):
"""
Try to match a single version project.

final_project, lang_slug, version_slug, filename = _get_project_data_from_request( # noqa
request,
project_slug=project_slug,
subproject_slug=kwargs.get('subproject_slug'),
lang_slug=kwargs.get('lang_slug'),
version_slug=version_slug,
filename=kwargs.get('filename', ''),
)
By default any path will match.
"""
file = self._normalize_filename(path)
version = canonical_project.versions.filter(
slug=canonical_project.default_version
).first()
if version:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what case will this be False? I think when they have a default_version that doesn't exist?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, this is when a use doesn't have a default version or the default version is invalid

return canonical_project, version, file
return canonical_project, None, None

# Handle our backend storage not supporting directory indexes,
# so we need to append index.html when appropriate.
if not filename or filename.endswith('/'):
# We need to add the index.html to find this actual file
filename += 'index.html'

log.debug(
'Unresolver parsed.',
project_slug=final_project.slug,
lang_slug=lang_slug,
version_slug=version_slug,
filename=filename,
def _unresolve_path(self, canonical_project, path, check_subprojects=True):
"""
Unresolve `path` with `canonical_project` as base.

If the returned project is `None`, then we weren't able to
unresolve the path into a project.

If the returned version is `None`, then we weren't able to
unresolve the path into a valid version of the project.

:param canonical_project: The project that owns the path.
:param path: The path to unresolve.
:param check_subprojects: If we should check for subprojects,
this is used to call this function recursively.

:returns: A tuple with: project, version, and file name.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to make this a dataclass or similar here as well and all these related methods.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is okay to have this as a tuple, they are internal methods, and we already have one dataclass for the final result.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if they are internal methods, real objects are a lot easier to read and follow while debugging/reading this code.

"""
# Multiversion project.
if not canonical_project.single_version:
response = self._match_multiversion_project(
canonical_project=canonical_project,
path=path,
)
if response:
return response

# Subprojects.
if check_subprojects:
response = self._match_subproject(
canonical_project=canonical_project,
path=path,
)
if response:
return response

# Single version project.
if canonical_project.single_version:
response = self._match_single_version_project(
canonical_project=canonical_project,
path=path,
)
if response:
return response

return None, None, None

@staticmethod
def get_domain_from_host(host):
"""
Get the normalized domain from a hostname.

A hostname can include the port.
"""
return host.lower().split(":")[0]

# TODO: make this a private method once
# proxito uses the unresolve method directly.
def unresolve_domain(self, domain):
"""
Unresolve domain by extracting relevant information from it.

:param str domain: Domain to extract the information from.
:returns: A tuple with: the project slug, domain object, and if the domain
is from an external version.
"""
public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN)
external_domain = self.get_domain_from_host(
settings.RTD_EXTERNAL_VERSION_DOMAIN
)
return UnresolvedObject(final_project, lang_slug, version_slug, filename, parsed.fragment)

subdomain, *rest_of_domain = domain.split(".", maxsplit=1)
rest_of_domain = rest_of_domain[0] if rest_of_domain else ""

if public_domain in domain:
# Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`.
if public_domain == rest_of_domain:
project_slug = subdomain
return project_slug, None, False

class Unresolver(SettingsOverrideObject):
# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com)
# for example, but these might be phishing, so let's ignore them for now.
return None, None, False

if external_domain in domain:
# Serve custom versions on external-host-domain.
if external_domain == rest_of_domain:
try:
project_slug, _ = subdomain.rsplit("--", maxsplit=1)
return project_slug, None, True
except ValueError:
return None, None, False

# Custom domain.
domain_object = (
Domain.objects.filter(domain=domain).prefetch_related("project").first()
)
if domain_object:
project_slug = domain_object.project.slug
return project_slug, domain_object, False

_default_class = UnresolverBase
return None, None, None


unresolver = Unresolver()
unresolve = unresolver.unresolve
unresolve_from_request = unresolver.unresolve_from_request
Loading