From e34fe11cf31ccc891d3e36baad32a254516fc4d0 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 1 Aug 2022 10:38:08 -0500 Subject: [PATCH 01/21] Move code --- readthedocs/proxito/middleware.py | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 2e7954fbf93..624fc6c046c 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -22,6 +22,97 @@ log = structlog.get_logger(__name__) # noqa +def _unresolve_domain(domain): + """ + Unresolve domain. + + We extract the project slug from the domain. + """ + host = request.get_host().lower().split(':')[0] + public_domain = settings.PUBLIC_DOMAIN.lower().split(':')[0] + external_domain = settings.RTD_EXTERNAL_VERSION_DOMAIN.lower().split(':')[0] + + host_parts = host.split('.') + public_domain_parts = public_domain.split('.') + external_domain_parts = external_domain.split('.') + + project_slug = None + + # Explicit Project slug being passed in + if 'HTTP_X_RTD_SLUG' in request.META: + project_slug = request.META['HTTP_X_RTD_SLUG'].lower() + if Project.objects.filter(slug=project_slug).exists(): + request.rtdheader = True + log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug) + return project_slug + + if public_domain in host or host == 'proxito': + # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN` + if public_domain_parts == host_parts[1:]: + project_slug = host_parts[0] + request.subdomain = True + log.debug('Proxito Public Domain.', host=host) + if Domain.objects.filter(project__slug=project_slug).filter( + canonical=True, + https=True, + ).exists(): + log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host) + request.canonicalize = constants.REDIRECT_CANONICAL_CNAME + elif ( + ProjectRelationship.objects. + filter(child__slug=project_slug).exists() + ): + log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host) + request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN + return project_slug + + # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example + # But these feel like they might be phishing, etc. so let's block them for now. + log.warning('Weird variation on our hostname.', host=host) + return render( + request, 'core/dns-404.html', context={'host': host}, status=400 + ) + + if external_domain in host: + # Serve custom versions on external-host-domain + if external_domain_parts == host_parts[1:]: + try: + project_slug, version_slug = host_parts[0].rsplit('--', 1) + request.external_domain = True + request.host_version_slug = version_slug + log.debug('Proxito External Version Domain.', host=host) + return project_slug + except ValueError: + log.warning('Weird variation on our hostname.', host=host) + return render( + request, 'core/dns-404.html', context={'host': host}, status=400 + ) + + # Serve CNAMEs + domain = Domain.objects.filter(domain=host).first() + if domain: + project_slug = domain.project.slug + request.cname = True + request.domain = domain + log.debug('Proxito CNAME.', host=host) + + if domain.https and not request.is_secure(): + # Redirect HTTP -> HTTPS (302) for this custom domain + log.debug('Proxito CNAME HTTPS Redirect.', host=host) + request.canonicalize = constants.REDIRECT_HTTPS + + # NOTE: consider redirecting non-canonical custom domains to the canonical one + # Whether that is another custom domain or the public domain + + return project_slug + + # Some person is CNAMEing to us without configuring a domain - 404. + log.debug('CNAME 404.', host=host) + return render( + request, 'core/dns-404.html', context={'host': host}, status=404 + ) + + def map_host_to_project_slug(request): # pylint: disable=too-many-return-statements """ Take the request and map the host to the proper project slug. From 6ac26b071d3bcebc3b3da6f71b067a27066d3eee Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 1 Aug 2022 10:44:01 -0500 Subject: [PATCH 02/21] Remove request --- readthedocs/proxito/middleware.py | 51 ++++++------------------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 624fc6c046c..cbb855f678e 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -22,15 +22,18 @@ log = structlog.get_logger(__name__) # noqa +def _extract_domain_from_netloc(netloc): + return netloc.lower().split(':')[0] + def _unresolve_domain(domain): """ Unresolve domain. We extract the project slug from the domain. """ - host = request.get_host().lower().split(':')[0] - public_domain = settings.PUBLIC_DOMAIN.lower().split(':')[0] - external_domain = settings.RTD_EXTERNAL_VERSION_DOMAIN.lower().split(':')[0] + host = domain + public_domain = _extract_domain_from_netloc(settings.PUBLIC_DOMAIN) + external_domain = _extract_domain_from_netloc(settings.RTD_EXTERNAL_VERSION_DOMAIN) host_parts = host.split('.') public_domain_parts = public_domain.split('.') @@ -38,69 +41,35 @@ def _unresolve_domain(domain): project_slug = None - # Explicit Project slug being passed in - if 'HTTP_X_RTD_SLUG' in request.META: - project_slug = request.META['HTTP_X_RTD_SLUG'].lower() - if Project.objects.filter(slug=project_slug).exists(): - request.rtdheader = True - log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug) - return project_slug - if public_domain in host or host == 'proxito': # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN` if public_domain_parts == host_parts[1:]: project_slug = host_parts[0] - request.subdomain = True log.debug('Proxito Public Domain.', host=host) - if Domain.objects.filter(project__slug=project_slug).filter( - canonical=True, - https=True, - ).exists(): - log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host) - request.canonicalize = constants.REDIRECT_CANONICAL_CNAME - elif ( - ProjectRelationship.objects. - filter(child__slug=project_slug).exists() - ): - log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host) - request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN return project_slug # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example # But these feel like they might be phishing, etc. so let's block them for now. log.warning('Weird variation on our hostname.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=400 - ) + return None if external_domain in host: # Serve custom versions on external-host-domain if external_domain_parts == host_parts[1:]: try: project_slug, version_slug = host_parts[0].rsplit('--', 1) - request.external_domain = True - request.host_version_slug = version_slug log.debug('Proxito External Version Domain.', host=host) return project_slug except ValueError: log.warning('Weird variation on our hostname.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=400 - ) + return None # Serve CNAMEs domain = Domain.objects.filter(domain=host).first() if domain: project_slug = domain.project.slug - request.cname = True - request.domain = domain log.debug('Proxito CNAME.', host=host) - if domain.https and not request.is_secure(): - # Redirect HTTP -> HTTPS (302) for this custom domain - log.debug('Proxito CNAME HTTPS Redirect.', host=host) - request.canonicalize = constants.REDIRECT_HTTPS - # NOTE: consider redirecting non-canonical custom domains to the canonical one # Whether that is another custom domain or the public domain @@ -108,9 +77,7 @@ def _unresolve_domain(domain): # Some person is CNAMEing to us without configuring a domain - 404. log.debug('CNAME 404.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=404 - ) + return None def map_host_to_project_slug(request): # pylint: disable=too-many-return-statements From a7322603c911dd7a8bd59b318cdf3b304d906d14 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 1 Aug 2022 11:00:29 -0500 Subject: [PATCH 03/21] Return tuple --- readthedocs/proxito/middleware.py | 39 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index cbb855f678e..0622e94a750 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -25,11 +25,14 @@ def _extract_domain_from_netloc(netloc): return netloc.lower().split(':')[0] + def _unresolve_domain(domain): """ Unresolve domain. - We extract the project slug from the domain. + :param str domain: Domain to extrac the project slug from. + :returns: A tuple with the project slug, domain object, and if the domain + is external. """ host = domain public_domain = _extract_domain_from_netloc(settings.PUBLIC_DOMAIN) @@ -39,45 +42,37 @@ def _unresolve_domain(domain): public_domain_parts = public_domain.split('.') external_domain_parts = external_domain.split('.') - project_slug = None - if public_domain in host or host == 'proxito': - # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN` + # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`. if public_domain_parts == host_parts[1:]: project_slug = host_parts[0] log.debug('Proxito Public Domain.', host=host) - return project_slug + return project_slug, None, False - # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example - # But these feel like they might be phishing, etc. so let's block them for now. + # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, + # but these might be phishing, so let's ignore them for now. log.warning('Weird variation on our hostname.', host=host) - return None + return None, None, False if external_domain in host: # Serve custom versions on external-host-domain if external_domain_parts == host_parts[1:]: try: - project_slug, version_slug = host_parts[0].rsplit('--', 1) + project_slug, _ = host_parts[0].rsplit('--', 1) log.debug('Proxito External Version Domain.', host=host) - return project_slug + return project_slug, None, True except ValueError: log.warning('Weird variation on our hostname.', host=host) - return None + return None, None, False # Serve CNAMEs - domain = Domain.objects.filter(domain=host).first() - if domain: - project_slug = domain.project.slug + domain_object = Domain.objects.filter(domain=host).prefetch_related("project").first() + if domain_object: + project_slug = domain_object.project.slug log.debug('Proxito CNAME.', host=host) + return project_slug, domain_object, False - # NOTE: consider redirecting non-canonical custom domains to the canonical one - # Whether that is another custom domain or the public domain - - return project_slug - - # Some person is CNAMEing to us without configuring a domain - 404. - log.debug('CNAME 404.', host=host) - return None + return None, None, None def map_host_to_project_slug(request): # pylint: disable=too-many-return-statements From 06727241f57b66c41bddfbe6d5e0b37d8298f6b9 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 1 Aug 2022 11:24:31 -0500 Subject: [PATCH 04/21] Replace --- readthedocs/proxito/middleware.py | 96 ++++++++++++++----------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 0622e94a750..7f3c454f72a 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -96,10 +96,6 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem external_domain = settings.RTD_EXTERNAL_VERSION_DOMAIN.lower().split(':')[0] host_parts = host.split('.') - public_domain_parts = public_domain.split('.') - external_domain_parts = external_domain.split('.') - - project_slug = None # Explicit Project slug being passed in if 'HTTP_X_RTD_SLUG' in request.META: @@ -109,57 +105,30 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug) return project_slug - if public_domain in host or host == 'proxito': - # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN` - if public_domain_parts == host_parts[1:]: - project_slug = host_parts[0] - request.subdomain = True - log.debug('Proxito Public Domain.', host=host) - if Domain.objects.filter(project__slug=project_slug).filter( - canonical=True, - https=True, - ).exists(): - log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host) - request.canonicalize = constants.REDIRECT_CANONICAL_CNAME - elif ( - ProjectRelationship.objects. - filter(child__slug=project_slug).exists() - ): - log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host) - request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN - return project_slug - - # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example - # But these feel like they might be phishing, etc. so let's block them for now. - log.warning('Weird variation on our hostname.', host=host) + project_slug, domain_object, external = _unresolve_domain(host) + if not project_slug: + # Block domains that look like ours, may be phishing. + if external_domain in host or public_domain in host: + log.warning("Weird variation on our hostname.", host=host) + return render( + request, + "core/dns-404.html", + context={"host": host}, + status=400, + ) + # Some person is CNAMEing to us without configuring a domain - 404. + log.debug('CNAME 404.', host=host) return render( - request, 'core/dns-404.html', context={'host': host}, status=400 + request, 'core/dns-404.html', context={'host': host}, status=404 ) - if external_domain in host: - # Serve custom versions on external-host-domain - if external_domain_parts == host_parts[1:]: - try: - project_slug, version_slug = host_parts[0].rsplit('--', 1) - request.external_domain = True - request.host_version_slug = version_slug - log.debug('Proxito External Version Domain.', host=host) - return project_slug - except ValueError: - log.warning('Weird variation on our hostname.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=400 - ) - - # Serve CNAMEs - domain = Domain.objects.filter(domain=host).first() - if domain: - project_slug = domain.project.slug + # Custom domain. + if domain_object: request.cname = True - request.domain = domain + request.domain = domain_object log.debug('Proxito CNAME.', host=host) - if domain.https and not request.is_secure(): + if domain_object.https and not request.is_secure(): # Redirect HTTP -> HTTPS (302) for this custom domain log.debug('Proxito CNAME HTTPS Redirect.', host=host) request.canonicalize = constants.REDIRECT_HTTPS @@ -169,11 +138,30 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem return project_slug - # Some person is CNAMEing to us without configuring a domain - 404. - log.debug('CNAME 404.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=404 - ) + # Pull request previews. + if external: + _, version_slug = host_parts[0].rsplit('--', 1) + request.external_domain = True + request.host_version_slug = version_slug + log.debug('Proxito External Version Domain.', host=host) + return project_slug + + # Normal doc serving. + request.subdomain = True + log.debug('Proxito Public Domain.', host=host) + if Domain.objects.filter(project__slug=project_slug).filter( + canonical=True, + https=True, + ).exists(): + log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host) + request.canonicalize = constants.REDIRECT_CANONICAL_CNAME + elif ( + ProjectRelationship.objects. + filter(child__slug=project_slug).exists() + ): + log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host) + request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN + return project_slug class ProxitoMiddleware(MiddlewareMixin): From 185771d2fabcdefa6519698d1e43e82e653bcad6 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 1 Aug 2022 12:10:22 -0500 Subject: [PATCH 05/21] Refactor --- readthedocs/proxito/middleware.py | 92 +++++++++++++++---------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 7f3c454f72a..20583c1e5d9 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -22,8 +22,13 @@ log = structlog.get_logger(__name__) # noqa -def _extract_domain_from_netloc(netloc): - return netloc.lower().split(':')[0] +def _get_domain_from_host(host): + """ + Get the normalized domain from a hostname. + + A hostname can include the port. + """ + return host.lower().split(":")[0] def _unresolve_domain(domain): @@ -34,42 +39,37 @@ def _unresolve_domain(domain): :returns: A tuple with the project slug, domain object, and if the domain is external. """ - host = domain - public_domain = _extract_domain_from_netloc(settings.PUBLIC_DOMAIN) - external_domain = _extract_domain_from_netloc(settings.RTD_EXTERNAL_VERSION_DOMAIN) + public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN) + external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) - host_parts = host.split('.') - public_domain_parts = public_domain.split('.') - external_domain_parts = external_domain.split('.') + subdomain, *rest_of_domain = domain.split(".", maxsplit=1) + rest_of_domain = rest_of_domain[0] if rest_of_domain else "" - if public_domain in host or host == 'proxito': + if public_domain in domain: # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`. - if public_domain_parts == host_parts[1:]: - project_slug = host_parts[0] - log.debug('Proxito Public Domain.', host=host) + if public_domain == rest_of_domain: + project_slug = subdomain return project_slug, None, False # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, # but these might be phishing, so let's ignore them for now. - log.warning('Weird variation on our hostname.', host=host) return None, None, False - if external_domain in host: - # Serve custom versions on external-host-domain - if external_domain_parts == host_parts[1:]: + if external_domain in domain: + # Serve custom versions on external-host-domain. + if external_domain == rest_of_domain: try: - project_slug, _ = host_parts[0].rsplit('--', 1) - log.debug('Proxito External Version Domain.', host=host) + project_slug, _ = subdomain.rsplit("--", maxsplit=1) return project_slug, None, True except ValueError: - log.warning('Weird variation on our hostname.', host=host) return None, None, False - # Serve CNAMEs - domain_object = Domain.objects.filter(domain=host).prefetch_related("project").first() + # Custom domain. + domain_object = ( + Domain.objects.filter(domain=domain).prefetch_related("project").first() + ) if domain_object: project_slug = domain_object.project.slug - log.debug('Proxito CNAME.', host=host) return project_slug, domain_object, False return None, None, None @@ -91,13 +91,11 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem - This sets ``request.canonicalize`` with the value as the reason """ - host = request.get_host().lower().split(':')[0] - public_domain = settings.PUBLIC_DOMAIN.lower().split(':')[0] - external_domain = settings.RTD_EXTERNAL_VERSION_DOMAIN.lower().split(':')[0] - - host_parts = host.split('.') + host = _get_domain_from_host(request.get_host()) + public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN) + external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) - # Explicit Project slug being passed in + # Explicit Project slug being passed in. if 'HTTP_X_RTD_SLUG' in request.META: project_slug = request.META['HTTP_X_RTD_SLUG'].lower() if Project.objects.filter(slug=project_slug).exists(): @@ -117,10 +115,8 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem status=400, ) # Some person is CNAMEing to us without configuring a domain - 404. - log.debug('CNAME 404.', host=host) - return render( - request, 'core/dns-404.html', context={'host': host}, status=404 - ) + log.debug("CNAME 404.", host=host) + return render(request, "core/dns-404.html", context={"host": host}, status=404) # Custom domain. if domain_object: @@ -129,7 +125,7 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem log.debug('Proxito CNAME.', host=host) if domain_object.https and not request.is_secure(): - # Redirect HTTP -> HTTPS (302) for this custom domain + # Redirect HTTP -> HTTPS (302) for this custom domain. log.debug('Proxito CNAME HTTPS Redirect.', host=host) request.canonicalize = constants.REDIRECT_HTTPS @@ -140,26 +136,30 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem # Pull request previews. if external: - _, version_slug = host_parts[0].rsplit('--', 1) + subdomain, _ = host.split(".", maxsplit=1) + _, version_slug = subdomain.rsplit("--", maxsplit=1) request.external_domain = True request.host_version_slug = version_slug - log.debug('Proxito External Version Domain.', host=host) + log.debug("Proxito External Version Domain.", host=host) return project_slug # Normal doc serving. request.subdomain = True - log.debug('Proxito Public Domain.', host=host) - if Domain.objects.filter(project__slug=project_slug).filter( - canonical=True, - https=True, - ).exists(): - log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host) - request.canonicalize = constants.REDIRECT_CANONICAL_CNAME - elif ( - ProjectRelationship.objects. - filter(child__slug=project_slug).exists() + log.debug("Proxito Public Domain.", host=host) + if ( + Domain.objects.filter(project__slug=project_slug) + .filter( + canonical=True, + https=True, + ) + .exists() ): - log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host) + log.debug("Proxito Public Domain -> Canonical Domain Redirect.", host=host) + request.canonicalize = constants.REDIRECT_CANONICAL_CNAME + elif ProjectRelationship.objects.filter(child__slug=project_slug).exists(): + log.debug( + "Proxito Public Domain -> Subproject Main Domain Redirect.", host=host + ) request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN return project_slug From 3d30d6b64a6b03f321ec245ba2e37d0a709f682a Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 9 Aug 2022 11:09:06 -0500 Subject: [PATCH 06/21] Updates from review --- readthedocs/proxito/middleware.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 20583c1e5d9..6f5a9fc71c7 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -33,11 +33,11 @@ def _get_domain_from_host(host): def _unresolve_domain(domain): """ - Unresolve domain. + Unresolve domain by extracting relevant information from it. - :param str domain: Domain to extrac the project slug from. - :returns: A tuple with the project slug, domain object, and if the domain - is external. + :param str domain: Domain to extract the information from. + :returns: A tuple with: the project slug, domain object, and if the domain + is from an external version. """ public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN) external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) From f583d2cba50afc2a5441da70c8f7d2d158ad95f6 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 12:46:42 -0500 Subject: [PATCH 07/21] Move get_domain_from_host to unresolver --- readthedocs/core/unresolver.py | 8 ++++++++ readthedocs/proxito/middleware.py | 19 ++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index beb8590f647..ee51fd183ca 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -88,6 +88,14 @@ def unresolve_from_request(self, request, path): ) return UnresolvedObject(final_project, lang_slug, version_slug, filename, parsed.fragment) + @staticmethod + def get_domain_from_host(host): + """ + Get the normalized domain from a hostname. + + A hostname can include the port. + """ + return host.lower().split(":")[0] class Unresolver(SettingsOverrideObject): diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 6f5a9fc71c7..b7a0a5ea458 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -17,19 +17,12 @@ from readthedocs.core.utils import get_cache_tag from readthedocs.projects.models import Domain, Project, ProjectRelationship +from readthedocs.core.unresolver import unresolver from readthedocs.proxito import constants log = structlog.get_logger(__name__) # noqa -def _get_domain_from_host(host): - """ - Get the normalized domain from a hostname. - - A hostname can include the port. - """ - return host.lower().split(":")[0] - def _unresolve_domain(domain): """ @@ -39,8 +32,8 @@ def _unresolve_domain(domain): :returns: A tuple with: the project slug, domain object, and if the domain is from an external version. """ - public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN) - external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) + public_domain = unresolver.get_domain_from_host(settings.PUBLIC_DOMAIN) + external_domain = unresolver.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) subdomain, *rest_of_domain = domain.split(".", maxsplit=1) rest_of_domain = rest_of_domain[0] if rest_of_domain else "" @@ -91,9 +84,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem - This sets ``request.canonicalize`` with the value as the reason """ - host = _get_domain_from_host(request.get_host()) - public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN) - external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) + host = unresolver.get_domain_from_host(request.get_host()) + public_domain = unresolver.get_domain_from_host(settings.PUBLIC_DOMAIN) + external_domain = unresolver.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) # Explicit Project slug being passed in. if 'HTTP_X_RTD_SLUG' in request.META: From 789f74c9f8b4eaa2e81bb7f6ce13703b25d0f17c Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 12:50:48 -0500 Subject: [PATCH 08/21] Move unresolve_domain --- readthedocs/core/unresolver.py | 48 +++++++++++++++++++++++++++++++ readthedocs/proxito/middleware.py | 47 +----------------------------- 2 files changed, 49 insertions(+), 46 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index ee51fd183ca..2d586132859 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -1,4 +1,5 @@ import structlog +from django.conf import settings from collections import namedtuple from urllib.parse import urlparse @@ -8,6 +9,7 @@ from readthedocs.core.utils.extend import SettingsOverrideObject from readthedocs.proxito.middleware import map_host_to_project_slug from readthedocs.proxito.views.mixins import ServeDocsMixin +from readthedocs.projects.models import Domain from readthedocs.proxito.views.utils import _get_project_data_from_request log = structlog.get_logger(__name__) @@ -97,6 +99,52 @@ def get_domain_from_host(host): """ return host.lower().split(":")[0] + # TODO: make this a private method once + # proxito uses the unresolve method directly. + def unresolve_domain(self, domain): + """ + Unresolve domain by extracting relevant information from it. + + :param str domain: Domain to extract the information from. + :returns: A tuple with: the project slug, domain object, and if the domain + is from an external version. + """ + public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN) + external_domain = self.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) + + subdomain, *rest_of_domain = domain.split(".", maxsplit=1) + rest_of_domain = rest_of_domain[0] if rest_of_domain else "" + + if public_domain in domain: + # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`. + if public_domain == rest_of_domain: + project_slug = subdomain + return project_slug, None, False + + # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, + # but these might be phishing, so let's ignore them for now. + return None, None, False + + if external_domain in domain: + # Serve custom versions on external-host-domain. + if external_domain == rest_of_domain: + try: + project_slug, _ = subdomain.rsplit("--", maxsplit=1) + return project_slug, None, True + except ValueError: + return None, None, False + + # Custom domain. + domain_object = ( + Domain.objects.filter(domain=domain).prefetch_related("project").first() + ) + if domain_object: + project_slug = domain_object.project.slug + return project_slug, domain_object, False + + return None, None, None + + class Unresolver(SettingsOverrideObject): _default_class = UnresolverBase diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index b7a0a5ea458..5037d2779fd 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -23,51 +23,6 @@ log = structlog.get_logger(__name__) # noqa - -def _unresolve_domain(domain): - """ - Unresolve domain by extracting relevant information from it. - - :param str domain: Domain to extract the information from. - :returns: A tuple with: the project slug, domain object, and if the domain - is from an external version. - """ - public_domain = unresolver.get_domain_from_host(settings.PUBLIC_DOMAIN) - external_domain = unresolver.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) - - subdomain, *rest_of_domain = domain.split(".", maxsplit=1) - rest_of_domain = rest_of_domain[0] if rest_of_domain else "" - - if public_domain in domain: - # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`. - if public_domain == rest_of_domain: - project_slug = subdomain - return project_slug, None, False - - # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, - # but these might be phishing, so let's ignore them for now. - return None, None, False - - if external_domain in domain: - # Serve custom versions on external-host-domain. - if external_domain == rest_of_domain: - try: - project_slug, _ = subdomain.rsplit("--", maxsplit=1) - return project_slug, None, True - except ValueError: - return None, None, False - - # Custom domain. - domain_object = ( - Domain.objects.filter(domain=domain).prefetch_related("project").first() - ) - if domain_object: - project_slug = domain_object.project.slug - return project_slug, domain_object, False - - return None, None, None - - def map_host_to_project_slug(request): # pylint: disable=too-many-return-statements """ Take the request and map the host to the proper project slug. @@ -96,7 +51,7 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug) return project_slug - project_slug, domain_object, external = _unresolve_domain(host) + project_slug, domain_object, external = unresolver.unresolve_domain(host) if not project_slug: # Block domains that look like ours, may be phishing. if external_domain in host or public_domain in host: From 067367d0d91fd14f5cd0928742dbb0e6c37283f3 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 15:43:00 -0500 Subject: [PATCH 09/21] Implement unresolve_path --- readthedocs/core/unresolver.py | 157 +++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 2d586132859..5f09137e8cd 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -1,4 +1,6 @@ import structlog +from readthedocs.projects.constants import LANGUAGES +import re from django.conf import settings from collections import namedtuple from urllib.parse import urlparse @@ -20,6 +22,8 @@ class UnresolverBase: + LANGUAGES_CODES = {code for code, _ in LANGUAGES} + def unresolve(self, url): """ Turn a URL into the component parts that our views would use to process them. @@ -90,6 +94,159 @@ def unresolve_from_request(self, request, path): ) return UnresolvedObject(final_project, lang_slug, version_slug, filename, parsed.fragment) + def _match_multiversion_project(self, canonical_project, path): + """ + Try to match a multiversion project. + + If the translation exists, we return a result even if the version doesn't, + so the translation is taken as the canonical project (useful for 404 pages). + """ + # This pattern matches: + # - /en + # - /en/ + # - /en/latest + # - /en/latest/ + # - /en/latest/file/name/ + versioned_pattern = re.compile( + r"^/(?P[^/]+)(/((?P[^/]+)(/(?P.*))?)?)?$" + ) + # if canonical_project.pattern: + # versioned_pattern = re.compile(canonical_project.pattern) + + match = versioned_pattern.match(path) + if not match: + return None + + language = match.group("language") + version_slug = match.group("version") + file = match.group("file") or "/" + + if language not in self.LANGUAGES_CODES: + return None + + if canonical_project.language == language: + project = canonical_project + else: + project = canonical_project.translations.filter( + language=language + ).first() + + if project: + version = project.versions.filter(slug=version_slug).first() + if version: + return project, version, file + return project, None, None + + return None + + def _match_subproject(self, canonical_project, path): + """ + Try to match a subproject. + + If the subproject exists, we try to resolve the rest of the path + with the subproject as the canonical project. + """ + # This pattern matches: + # - /projects/subproject + # - /projects/subproject/ + # - /projects/subproject/file/name/ + subproject_pattern = re.compile( + r"^/projects/(?P[^/]+)(/(?P.*))?$" + ) + # if canonical_project.subproject_pattern: + # subproject_pattern = re.compile(canonical_project.subproject_pattern) + + match = subproject_pattern.match(path) + if not match: + return None + + project_slug = match.group("project") + file = match.group("file") or "/" + # TODO: check if all of our projects have an alias. + project_relationship = ( + canonical_project.subprojects.filter(alias=project_slug) + .prefetch_related("child") + .first() + ) + if project_relationship: + return self._unresolve_path( + canonical_project=project_relationship.child, + path=file, + check_subprojects=False, + ) + return None + + def _match_single_version_project(self, canonical_project, path): + """ + Try to match a single version project. + + With the default pattern any path will match. + """ + # This pattern matches: + # - + # - / + # - /file/name/ + single_version_pattern = re.compile(r"^(?P.+)$") + + # if canonical_project.pattern: + # single_version_pattern = re.compile(canonical_project.pattern) + + match = single_version_pattern.match(path) + if not match: + return None + + file = match.group("file") or "/" + version = canonical_project.versions.filter(slug=canonical_project.default_version).first() + if version: + return canonical_project, version, file + return canonical_project, None, None + + def _unresolve_path(self, canonical_project, path, check_subprojects=True): + """ + Unresolve `path` with `canonical_project` as base. + + If the returned project is `None`, then we weren't able to + unresolve the path into a project. + + If the returned version is `None`, then we weren't able to + unresolve the path into a valid version of the project. + + :param canonical_project: The project that owns the path. + :param path: The path to unresolve. + :param check_subprojects: If we should check for subprojects, + this is used to call this function recursively. + + :returns: A tuple with: project, version, and file name. + """ + # Multiversion project. + if not canonical_project.single_version: + response = self._match_multiversion_project( + canonical_project=canonical_project, + path=path, + ) + if response: + return response + + # Subprojects. + if check_subprojects: + response = self._match_subproject( + canonical_project=canonical_project, + path=path, + ) + if response: + return response + + # Single version project. + if canonical_project.single_version: + response = self._match_single_version_project( + canonical_project=canonical_project, + path=path, + ) + if response: + return response + + return None, None, None + @staticmethod def get_domain_from_host(host): """ From 1cf32b4b68ab14b29937913737b543b4abdf1e2a Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 16:07:57 -0500 Subject: [PATCH 10/21] Use new implementation for unresolver --- readthedocs/core/unresolver.py | 99 +++++++++++++--------------------- 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 5f09137e8cd..77ac72885e4 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -1,98 +1,73 @@ import structlog +from dataclasses import dataclass from readthedocs.projects.constants import LANGUAGES import re from django.conf import settings -from collections import namedtuple from urllib.parse import urlparse -from django.test.client import RequestFactory -from django.urls import resolve as url_resolve +from readthedocs.builds.models import Version from readthedocs.core.utils.extend import SettingsOverrideObject -from readthedocs.proxito.middleware import map_host_to_project_slug -from readthedocs.proxito.views.mixins import ServeDocsMixin -from readthedocs.projects.models import Domain -from readthedocs.proxito.views.utils import _get_project_data_from_request +from readthedocs.projects.models import Domain, Project log = structlog.get_logger(__name__) -UnresolvedObject = namedtuple( - 'Unresolved', 'project, language_slug, version_slug, filename, fragment') + +@dataclass(slots=True) +class UnresolvedURL: + canonical_project: Project + project: Project + version: Version = None + filename: str = None + query: str = None + fragment: str = None + domain: Domain = None + external: bool = False class UnresolverBase: LANGUAGES_CODES = {code for code, _ in LANGUAGES} - def unresolve(self, url): + def unresolve(self, url, normalize_filename=False): """ Turn a URL into the component parts that our views would use to process them. This is useful for lots of places, like where we want to figure out exactly what file a URL maps to. + + :param url: Full URL to unresolve (including the protocol and domain part). + :param normalize_filename: If `True` the filename will be normalized + to end with ``/index.html``. """ parsed = urlparse(url) - domain = parsed.netloc.split(':', 1)[0] - - # TODO: Make this not depend on the request object, - # but instead move all this logic here working on strings. - request = RequestFactory().get(path=parsed.path, HTTP_HOST=domain) - project_slug = map_host_to_project_slug(request) - - # Handle returning a response - if hasattr(project_slug, 'status_code'): + domain = self.get_domain_from_host(parsed.netloc) + project_slug, domain_object, external = self.unresolve_domain(domain) + if not project_slug: return None - request.host_project_slug = request.slug = project_slug - return self.unresolve_from_request(request, url) - - def unresolve_from_request(self, request, path): - """ - Unresolve using a request. - - ``path`` can be a full URL, but the domain will be ignored, - since that information is already in the request object. - - None is returned if the request isn't valid. - """ - parsed = urlparse(path) - path = parsed.path - project_slug = getattr(request, 'host_project_slug', None) - - if not project_slug: + canonical_project = Project.objects.filter(slug=project_slug).first() + if not canonical_project: return None - _, __, kwargs = url_resolve( - path, - urlconf='readthedocs.proxito.urls', + project, version, filename = self._unresolve_path( + canonical_project=canonical_project, + path=parsed.path, ) - mixin = ServeDocsMixin() - version_slug = mixin.get_version_from_host(request, kwargs.get('version_slug')) - - final_project, lang_slug, version_slug, filename = _get_project_data_from_request( # noqa - request, - project_slug=project_slug, - subproject_slug=kwargs.get('subproject_slug'), - lang_slug=kwargs.get('lang_slug'), - version_slug=version_slug, - filename=kwargs.get('filename', ''), - ) + if normalize_filename and filename and filename.endswith("/"): + filename += "index.html" - # Handle our backend storage not supporting directory indexes, - # so we need to append index.html when appropriate. - if not filename or filename.endswith('/'): - # We need to add the index.html to find this actual file - filename += 'index.html' - - log.debug( - 'Unresolver parsed.', - project_slug=final_project.slug, - lang_slug=lang_slug, - version_slug=version_slug, + return UnresolvedURL( + canonical_project=canonical_project, + project=project or canonical_project, + version=version, filename=filename, + query=parsed.query, + fragment=parsed.fragment, + domain=domain_object, + external=external, ) - return UnresolvedObject(final_project, lang_slug, version_slug, filename, parsed.fragment) def _match_multiversion_project(self, canonical_project, path): """ From 53fa8aa20fd12011a379404dcb244a1beeaf58cf Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 17:21:01 -0500 Subject: [PATCH 11/21] Tests --- readthedocs/analytics/proxied_api.py | 9 +- readthedocs/api/mixins.py | 2 +- readthedocs/core/unresolver.py | 17 +- readthedocs/rtd_tests/tests/test_resolver.py | 3 + .../rtd_tests/tests/test_unresolver.py | 213 ++++++++++++++---- 5 files changed, 184 insertions(+), 60 deletions(-) diff --git a/readthedocs/analytics/proxied_api.py b/readthedocs/analytics/proxied_api.py index c9e7cfdeab5..de41d6c72f8 100644 --- a/readthedocs/analytics/proxied_api.py +++ b/readthedocs/analytics/proxied_api.py @@ -8,7 +8,7 @@ from readthedocs.analytics.models import PageView from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion -from readthedocs.core.unresolver import unresolve_from_request +from readthedocs.core.unresolver import unresolve from readthedocs.core.utils.extend import SettingsOverrideObject from readthedocs.projects.models import Project @@ -50,7 +50,6 @@ def get(self, request, *args, **kwargs): version = self._get_version() absolute_uri = self.request.GET.get('absolute_uri') self.increase_page_view_count( - request=request, project=project, version=version, absolute_uri=absolute_uri, @@ -58,10 +57,10 @@ def get(self, request, *args, **kwargs): return Response(status=200) # pylint: disable=no-self-use - def increase_page_view_count(self, request, project, version, absolute_uri): + def increase_page_view_count(self, project, version, absolute_uri): """Increase the page view count for the given project.""" - unresolved = unresolve_from_request(request, absolute_uri) - if not unresolved: + unresolved = unresolve(absolute_uri) + if not unresolved or not unresolved.filename: return path = unresolved.filename diff --git a/readthedocs/api/mixins.py b/readthedocs/api/mixins.py index f73e38d4b5a..1f137646413 100644 --- a/readthedocs/api/mixins.py +++ b/readthedocs/api/mixins.py @@ -99,7 +99,7 @@ def _get_version(self): return if self.unresolved_url: - version_slug = self.unresolved_url.version_slug + version_slug = self.unresolved_url.version.slug else: version_slug = self.request.GET.get("version", "latest") project = self._get_project() diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 77ac72885e4..83b13930b22 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -29,7 +29,7 @@ class UnresolverBase: LANGUAGES_CODES = {code for code, _ in LANGUAGES} - def unresolve(self, url, normalize_filename=False): + def unresolve(self, url, normalize_filename=True): """ Turn a URL into the component parts that our views would use to process them. @@ -69,6 +69,14 @@ def unresolve(self, url, normalize_filename=False): external=external, ) + @staticmethod + def _normalize_filename(filename): + """Normalize filename to always start with ``/``.""" + filename = filename or "/" + if not filename.startswith("/"): + filename = "/" + filename + return filename + def _match_multiversion_project(self, canonical_project, path): """ Try to match a multiversion project. @@ -94,7 +102,7 @@ def _match_multiversion_project(self, canonical_project, path): language = match.group("language") version_slug = match.group("version") - file = match.group("file") or "/" + file = self._normalize_filename(match.group("file")) if language not in self.LANGUAGES_CODES: return None @@ -136,7 +144,7 @@ def _match_subproject(self, canonical_project, path): return None project_slug = match.group("project") - file = match.group("file") or "/" + file = self._normalize_filename(match.group("file")) # TODO: check if all of our projects have an alias. project_relationship = ( canonical_project.subprojects.filter(alias=project_slug) @@ -170,7 +178,7 @@ def _match_single_version_project(self, canonical_project, path): if not match: return None - file = match.group("file") or "/" + file = self._normalize_filename(match.group("file")) version = canonical_project.versions.filter(slug=canonical_project.default_version).first() if version: return canonical_project, version, file @@ -284,4 +292,3 @@ class Unresolver(SettingsOverrideObject): unresolver = Unresolver() unresolve = unresolver.unresolve -unresolve_from_request = unresolver.unresolve_from_request diff --git a/readthedocs/rtd_tests/tests/test_resolver.py b/readthedocs/rtd_tests/tests/test_resolver.py index 9fdc93ccaf0..12309bae00f 100644 --- a/readthedocs/rtd_tests/tests/test_resolver.py +++ b/readthedocs/rtd_tests/tests/test_resolver.py @@ -21,6 +21,7 @@ def setUp(self): users=[self.owner], main_language_project=None, ) + self.version = self.pip.versions.first() self.subproject = fixture.get( Project, slug='sub', @@ -28,6 +29,7 @@ def setUp(self): users=[self.owner], main_language_project=None, ) + self.subproject_version = self.subproject.versions.first() self.translation = fixture.get( Project, slug='trans', @@ -35,6 +37,7 @@ def setUp(self): users=[self.owner], main_language_project=None, ) + self.translation_version = self.translation.versions.first() self.pip.add_subproject(self.subproject) self.pip.translations.add(self.translation) diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index c86a57599b6..6e89180286a 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -1,10 +1,13 @@ -from django.test import override_settings import django_dynamic_fixture as fixture import pytest +from django.test import override_settings +from django_dynamic_fixture import get -from readthedocs.rtd_tests.tests.test_resolver import ResolverBase +from readthedocs.builds.constants import EXTERNAL +from readthedocs.builds.models import Version from readthedocs.core.unresolver import unresolve -from readthedocs.projects.models import Domain +from readthedocs.projects.models import Domain, Project +from readthedocs.rtd_tests.tests.test_resolver import ResolverBase @override_settings( @@ -15,68 +18,180 @@ class UnResolverTests(ResolverBase): def test_unresolver(self): - parts = unresolve('http://pip.readthedocs.io/en/latest/foo.html#fragment') - self.assertEqual(parts.project.slug, 'pip') - self.assertEqual(parts.language_slug, 'en') - self.assertEqual(parts.version_slug, 'latest') - self.assertEqual(parts.filename, 'foo.html') - self.assertEqual(parts.fragment, 'fragment') + parts = unresolve('https://pip.readthedocs.io/en/latest/foo.html?search=api#fragment') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/foo.html") + self.assertEqual(parts.fragment, "fragment") + self.assertEqual(parts.query, "search=api") + + def test_unnormalized_filename(self): + parts = unresolve('https://pip.readthedocs.io/en/latest/file/', normalize_filename=False) + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/file/") + + def test_no_trailing_slash(self): + parts = unresolve("https://pip.readthedocs.io/en/latest") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/index.html") + + def test_trailing_slash(self): + parts = unresolve("https://pip.readthedocs.io/en/latest/") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/index.html") + + def test_file_with_trailing_slash(self): + parts = unresolve("https://pip.readthedocs.io/en/latest/foo/") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/foo/index.html") + + def test_path_no_version(self): + urls = [ + "https://pip.readthedocs.io/en", + "https://pip.readthedocs.io/en/", + ] + for url in urls: + parts = unresolve(url) + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) def test_unresolver_subproject(self): - parts = unresolve('http://pip.readthedocs.io/projects/sub/ja/latest/foo.html') - self.assertEqual(parts.project.slug, 'sub') - self.assertEqual(parts.language_slug, 'ja') - self.assertEqual(parts.version_slug, 'latest') - self.assertEqual(parts.filename, 'foo.html') + parts = unresolve('https://pip.readthedocs.io/projects/sub/ja/latest/foo.html') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, self.subproject_version) + self.assertEqual(parts.filename, "/foo.html") + + def test_unresolve_subproject_with_translation(self): + subproject_translation = get( + Project, + main_language_project=self.subproject, + language="en", + slug="subproject-translation", + ) + version = subproject_translation.versions.first() + parts = unresolve("https://pip.readthedocs.io/projects/sub/en/latest/foo.html") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, subproject_translation) + self.assertEqual(parts.version, version) + self.assertEqual(parts.filename, "/foo.html") + + def test_unresolve_subproject_single_version(self): + self.subproject.single_version = True + self.subproject.save() + parts = unresolve("https://pip.readthedocs.io/projects/sub/foo.html") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, self.subproject_version) + self.assertEqual(parts.filename, "/foo.html") + + def test_unresolve_subproject_alias(self): + relation = self.pip.subprojects.first() + relation.alias = "sub_alias" + relation.save() + parts = unresolve("https://pip.readthedocs.io/projects/sub_alias/ja/latest/") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, self.subproject_version) + self.assertEqual(parts.filename, "/index.html") def test_unresolver_translation(self): - parts = unresolve('http://pip.readthedocs.io/ja/latest/foo.html') - self.assertEqual(parts.project.slug, 'trans') - self.assertEqual(parts.language_slug, 'ja') - self.assertEqual(parts.version_slug, 'latest') - self.assertEqual(parts.filename, 'foo.html') + parts = unresolve('https://pip.readthedocs.io/ja/latest/foo.html') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.translation) + self.assertEqual(parts.version, self.translation_version) + self.assertEqual(parts.filename, "/foo.html") + + def test_unresolve_no_existing_translation(self): + parts = unresolve("https://pip.readthedocs.io/es/latest/") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) - def test_unresolver_domain(self): + def test_unresolver_custom_domain(self): self.domain = fixture.get( Domain, domain='docs.foobar.com', project=self.pip, canonical=True, ) - parts = unresolve('http://docs.foobar.com/en/latest/') - self.assertEqual(parts.project.slug, 'pip') - self.assertEqual(parts.language_slug, 'en') - self.assertEqual(parts.version_slug, 'latest') - self.assertEqual(parts.filename, 'index.html') + parts = unresolve('https://docs.foobar.com/en/latest/') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/index.html") - def test_unresolver_single_version(self): + def test_unresolve_single_version(self): self.pip.single_version = True - parts = unresolve('http://pip.readthedocs.io/') - self.assertEqual(parts.project.slug, 'pip') - self.assertEqual(parts.language_slug, None) - self.assertEqual(parts.version_slug, None) - self.assertEqual(parts.filename, 'index.html') + self.pip.save() + parts = unresolve('https://pip.readthedocs.io/') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/index.html") - def test_unresolver_subproject_alias(self): - relation = self.pip.subprojects.first() - relation.alias = 'sub_alias' - relation.save() - parts = unresolve('http://pip.readthedocs.io/projects/sub_alias/ja/latest/') - self.assertEqual(parts.project.slug, 'sub') - self.assertEqual(parts.language_slug, 'ja') - self.assertEqual(parts.version_slug, 'latest') - self.assertEqual(parts.filename, 'index.html') + def test_unresolve_single_version_translation_like_path(self): + self.pip.single_version = True + self.pip.save() + parts = unresolve("https://pip.readthedocs.io/en/latest/index.html") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/en/latest/index.html") + + def test_unresolve_single_version_subproject_like_path(self): + self.pip.single_version = True + self.pip.save() + parts = unresolve( + "https://pip.readthedocs.io/projects/other/en/latest/index.html" + ) + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, self.version) + self.assertEqual(parts.filename, "/projects/other/en/latest/index.html") + + def test_unresolve_single_version_subproject(self): + self.pip.single_version = True + self.pip.save() + parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/latest/index.html") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, self.subproject_version) + self.assertEqual(parts.filename, "/index.html") def test_unresolver_external_version(self): - ver = self.pip.versions.first() - ver.type = 'external' - ver.slug = '10' - parts = unresolve('http://pip--10.dev.readthedocs.build/en/10/') - self.assertEqual(parts.project.slug, 'pip') - self.assertEqual(parts.language_slug, 'en') - self.assertEqual(parts.version_slug, '10') - self.assertEqual(parts.filename, 'index.html') + version = get( + Version, + project=self.pip, + type=EXTERNAL, + slug="10", + active=True, + ) + parts = unresolve('https://pip--10.dev.readthedocs.build/en/10/') + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, version) + self.assertEqual(parts.filename, "/index.html") + + def test_unresolve_external_version_no_existing_version(self): + parts = unresolve("https://pip--10.dev.readthedocs.build/en/10/") + self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) def test_unresolver_unknown_host(self): - parts = unresolve('http://random.stuff.com/en/latest/') + parts = unresolve('https://random.stuff.com/en/latest/') self.assertEqual(parts, None) From a62c1495804d4a460d03fd29699c7dc15527b276 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 15 Aug 2022 17:51:53 -0500 Subject: [PATCH 12/21] Refactor and cleanup --- readthedocs/core/unresolver.py | 119 ++++++++---------- readthedocs/proxito/middleware.py | 6 +- .../rtd_tests/tests/test_unresolver.py | 24 ++-- 3 files changed, 70 insertions(+), 79 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 83b13930b22..750d60c19f6 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -1,13 +1,12 @@ -import structlog -from dataclasses import dataclass -from readthedocs.projects.constants import LANGUAGES import re -from django.conf import settings +from dataclasses import dataclass from urllib.parse import urlparse -from readthedocs.builds.models import Version +import structlog +from django.conf import settings -from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.builds.models import Version +from readthedocs.constants import pattern_opts from readthedocs.projects.models import Domain, Project log = structlog.get_logger(__name__) @@ -15,8 +14,17 @@ @dataclass(slots=True) class UnresolvedURL: + + """Dataclass with the parts of an unresolved URL.""" + + # This is the project that owns the domain, + # this usually is the parent project of a translation or subproject. canonical_project: Project + + # The current project we are serving the docs from. + # It can be the same as canonical_project. project: Project + version: Version = None filename: str = None query: str = None @@ -25,11 +33,30 @@ class UnresolvedURL: external: bool = False -class UnresolverBase: - - LANGUAGES_CODES = {code for code, _ in LANGUAGES} +class Unresolver: + # This pattern matches: + # - /en + # - /en/ + # - /en/latest + # - /en/latest/ + # - /en/latest/file/name/ + multiversion_pattern = re.compile( + r"^/(?P{lang_slug})(/((?P{version_slug})(/(?P{filename_slug}))?)?)?$".format( # noqa + **pattern_opts + ) + ) + + # This pattern matches: + # - /projects/subproject + # - /projects/subproject/ + # - /projects/subproject/file/name/ + subproject_pattern = re.compile( + r"^/projects/(?P{project_slug}+)(/(?P{filename_slug}))?$".format( + **pattern_opts + ) + ) - def unresolve(self, url, normalize_filename=True): + def unresolve(self, url, add_index=True): """ Turn a URL into the component parts that our views would use to process them. @@ -37,7 +64,7 @@ def unresolve(self, url, normalize_filename=True): like where we want to figure out exactly what file a URL maps to. :param url: Full URL to unresolve (including the protocol and domain part). - :param normalize_filename: If `True` the filename will be normalized + :param add_index: If `True` the filename will be normalized to end with ``/index.html``. """ parsed = urlparse(url) @@ -55,7 +82,7 @@ def unresolve(self, url, normalize_filename=True): path=parsed.path, ) - if normalize_filename and filename and filename.endswith("/"): + if add_index and filename and filename.endswith("/"): filename += "index.html" return UnresolvedURL( @@ -84,19 +111,7 @@ def _match_multiversion_project(self, canonical_project, path): If the translation exists, we return a result even if the version doesn't, so the translation is taken as the canonical project (useful for 404 pages). """ - # This pattern matches: - # - /en - # - /en/ - # - /en/latest - # - /en/latest/ - # - /en/latest/file/name/ - versioned_pattern = re.compile( - r"^/(?P[^/]+)(/((?P[^/]+)(/(?P.*))?)?)?$" - ) - # if canonical_project.pattern: - # versioned_pattern = re.compile(canonical_project.pattern) - - match = versioned_pattern.match(path) + match = self.multiversion_pattern.match(path) if not match: return None @@ -104,15 +119,10 @@ def _match_multiversion_project(self, canonical_project, path): version_slug = match.group("version") file = self._normalize_filename(match.group("file")) - if language not in self.LANGUAGES_CODES: - return None - if canonical_project.language == language: project = canonical_project else: - project = canonical_project.translations.filter( - language=language - ).first() + project = canonical_project.translations.filter(language=language).first() if project: version = project.versions.filter(slug=version_slug).first() @@ -129,23 +139,12 @@ def _match_subproject(self, canonical_project, path): If the subproject exists, we try to resolve the rest of the path with the subproject as the canonical project. """ - # This pattern matches: - # - /projects/subproject - # - /projects/subproject/ - # - /projects/subproject/file/name/ - subproject_pattern = re.compile( - r"^/projects/(?P[^/]+)(/(?P.*))?$" - ) - # if canonical_project.subproject_pattern: - # subproject_pattern = re.compile(canonical_project.subproject_pattern) - - match = subproject_pattern.match(path) + match = self.subproject_pattern.match(path) if not match: return None project_slug = match.group("project") file = self._normalize_filename(match.group("file")) - # TODO: check if all of our projects have an alias. project_relationship = ( canonical_project.subprojects.filter(alias=project_slug) .prefetch_related("child") @@ -163,23 +162,12 @@ def _match_single_version_project(self, canonical_project, path): """ Try to match a single version project. - With the default pattern any path will match. + By default any path will match. """ - # This pattern matches: - # - - # - / - # - /file/name/ - single_version_pattern = re.compile(r"^(?P.+)$") - - # if canonical_project.pattern: - # single_version_pattern = re.compile(canonical_project.pattern) - - match = single_version_pattern.match(path) - if not match: - return None - - file = self._normalize_filename(match.group("file")) - version = canonical_project.versions.filter(slug=canonical_project.default_version).first() + file = self._normalize_filename(path) + version = canonical_project.versions.filter( + slug=canonical_project.default_version + ).first() if version: return canonical_project, version, file return canonical_project, None, None @@ -250,7 +238,9 @@ def unresolve_domain(self, domain): is from an external version. """ public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN) - external_domain = self.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) + external_domain = self.get_domain_from_host( + settings.RTD_EXTERNAL_VERSION_DOMAIN + ) subdomain, *rest_of_domain = domain.split(".", maxsplit=1) rest_of_domain = rest_of_domain[0] if rest_of_domain else "" @@ -261,8 +251,8 @@ def unresolve_domain(self, domain): project_slug = subdomain return project_slug, None, False - # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, - # but these might be phishing, so let's ignore them for now. + # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) + # for example, but these might be phishing, so let's ignore them for now. return None, None, False if external_domain in domain: @@ -285,10 +275,5 @@ def unresolve_domain(self, domain): return None, None, None -class Unresolver(SettingsOverrideObject): - - _default_class = UnresolverBase - - unresolver = Unresolver() unresolve = unresolver.unresolve diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 5037d2779fd..05e62839f80 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -15,9 +15,9 @@ from django.urls import reverse from django.utils.deprecation import MiddlewareMixin +from readthedocs.core.unresolver import unresolver from readthedocs.core.utils import get_cache_tag from readthedocs.projects.models import Domain, Project, ProjectRelationship -from readthedocs.core.unresolver import unresolver from readthedocs.proxito import constants log = structlog.get_logger(__name__) # noqa @@ -41,7 +41,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem host = unresolver.get_domain_from_host(request.get_host()) public_domain = unresolver.get_domain_from_host(settings.PUBLIC_DOMAIN) - external_domain = unresolver.get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN) + external_domain = unresolver.get_domain_from_host( + settings.RTD_EXTERNAL_VERSION_DOMAIN + ) # Explicit Project slug being passed in. if 'HTTP_X_RTD_SLUG' in request.META: diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index 6e89180286a..e38342c35ed 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -18,7 +18,9 @@ class UnResolverTests(ResolverBase): def test_unresolver(self): - parts = unresolve('https://pip.readthedocs.io/en/latest/foo.html?search=api#fragment') + parts = unresolve( + "https://pip.readthedocs.io/en/latest/foo.html?search=api#fragment" + ) self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) @@ -26,8 +28,8 @@ def test_unresolver(self): self.assertEqual(parts.fragment, "fragment") self.assertEqual(parts.query, "search=api") - def test_unnormalized_filename(self): - parts = unresolve('https://pip.readthedocs.io/en/latest/file/', normalize_filename=False) + def test_filename_wihtout_index(self): + parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False) self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) @@ -67,7 +69,7 @@ def test_path_no_version(self): self.assertEqual(parts.filename, None) def test_unresolver_subproject(self): - parts = unresolve('https://pip.readthedocs.io/projects/sub/ja/latest/foo.html') + parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/latest/foo.html") self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) @@ -107,7 +109,7 @@ def test_unresolve_subproject_alias(self): self.assertEqual(parts.filename, "/index.html") def test_unresolver_translation(self): - parts = unresolve('https://pip.readthedocs.io/ja/latest/foo.html') + parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html") self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.translation) self.assertEqual(parts.version, self.translation_version) @@ -127,7 +129,7 @@ def test_unresolver_custom_domain(self): project=self.pip, canonical=True, ) - parts = unresolve('https://docs.foobar.com/en/latest/') + parts = unresolve("https://docs.foobar.com/en/latest/") self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) @@ -136,7 +138,7 @@ def test_unresolver_custom_domain(self): def test_unresolve_single_version(self): self.pip.single_version = True self.pip.save() - parts = unresolve('https://pip.readthedocs.io/') + parts = unresolve("https://pip.readthedocs.io/") self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) @@ -165,7 +167,9 @@ def test_unresolve_single_version_subproject_like_path(self): def test_unresolve_single_version_subproject(self): self.pip.single_version = True self.pip.save() - parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/latest/index.html") + parts = unresolve( + "https://pip.readthedocs.io/projects/sub/ja/latest/index.html" + ) self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) @@ -179,7 +183,7 @@ def test_unresolver_external_version(self): slug="10", active=True, ) - parts = unresolve('https://pip--10.dev.readthedocs.build/en/10/') + parts = unresolve("https://pip--10.dev.readthedocs.build/en/10/") self.assertEqual(parts.canonical_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, version) @@ -193,5 +197,5 @@ def test_unresolve_external_version_no_existing_version(self): self.assertEqual(parts.filename, None) def test_unresolver_unknown_host(self): - parts = unresolve('https://random.stuff.com/en/latest/') + parts = unresolve("https://random.stuff.com/en/latest/") self.assertEqual(parts, None) From aba6fc4be01534be9294f00b3dcde8c57f40c311 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 18 Aug 2022 13:04:47 -0500 Subject: [PATCH 13/21] Rename canonical_project -> parent_project --- readthedocs/core/unresolver.py | 54 +++++++++---------- .../rtd_tests/tests/test_unresolver.py | 38 ++++++------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 874b1b2c011..d067b78ab84 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -19,10 +19,10 @@ class UnresolvedURL: # This is the project that owns the domain, # this usually is the parent project of a translation or subproject. - canonical_project: Project + parent_project: Project # The current project we are serving the docs from. - # It can be the same as canonical_project. + # It can be the same as parent_project. project: Project version: Version = None @@ -73,12 +73,12 @@ def unresolve(self, url, add_index=True): if not project_slug: return None - canonical_project = Project.objects.filter(slug=project_slug).first() - if not canonical_project: + parent_project = Project.objects.filter(slug=project_slug).first() + if not parent_project: return None project, version, filename = self._unresolve_path( - canonical_project=canonical_project, + parent_project=parent_project, path=parsed.path, ) @@ -86,8 +86,8 @@ def unresolve(self, url, add_index=True): filename += "index.html" return UnresolvedURL( - canonical_project=canonical_project, - project=project or canonical_project, + parent_project=parent_project, + project=project or parent_project, version=version, filename=filename, query=parsed.query, @@ -104,7 +104,7 @@ def _normalize_filename(filename): filename = "/" + filename return filename - def _match_multiversion_project(self, canonical_project, path): + def _match_multiversion_project(self, parent_project, path): """ Try to match a multiversion project. @@ -119,10 +119,10 @@ def _match_multiversion_project(self, canonical_project, path): version_slug = match.group("version") file = self._normalize_filename(match.group("file")) - if canonical_project.language == language: - project = canonical_project + if parent_project.language == language: + project = parent_project else: - project = canonical_project.translations.filter(language=language).first() + project = parent_project.translations.filter(language=language).first() if project: version = project.versions.filter(slug=version_slug).first() @@ -132,7 +132,7 @@ def _match_multiversion_project(self, canonical_project, path): return None - def _match_subproject(self, canonical_project, path): + def _match_subproject(self, parent_project, path): """ Try to match a subproject. @@ -146,35 +146,35 @@ def _match_subproject(self, canonical_project, path): project_slug = match.group("project") file = self._normalize_filename(match.group("file")) project_relationship = ( - canonical_project.subprojects.filter(alias=project_slug) + parent_project.subprojects.filter(alias=project_slug) .prefetch_related("child") .first() ) if project_relationship: return self._unresolve_path( - canonical_project=project_relationship.child, + parent_project=project_relationship.child, path=file, check_subprojects=False, ) return None - def _match_single_version_project(self, canonical_project, path): + def _match_single_version_project(self, parent_project, path): """ Try to match a single version project. By default any path will match. """ file = self._normalize_filename(path) - version = canonical_project.versions.filter( - slug=canonical_project.default_version + version = parent_project.versions.filter( + slug=parent_project.default_version ).first() if version: - return canonical_project, version, file - return canonical_project, None, None + return parent_project, version, file + return parent_project, None, None - def _unresolve_path(self, canonical_project, path, check_subprojects=True): + def _unresolve_path(self, parent_project, path, check_subprojects=True): """ - Unresolve `path` with `canonical_project` as base. + Unresolve `path` with `parent_project` as base. If the returned project is `None`, then we weren't able to unresolve the path into a project. @@ -182,7 +182,7 @@ def _unresolve_path(self, canonical_project, path, check_subprojects=True): If the returned version is `None`, then we weren't able to unresolve the path into a valid version of the project. - :param canonical_project: The project that owns the path. + :param parent_project: The project that owns the path. :param path: The path to unresolve. :param check_subprojects: If we should check for subprojects, this is used to call this function recursively. @@ -190,9 +190,9 @@ def _unresolve_path(self, canonical_project, path, check_subprojects=True): :returns: A tuple with: project, version, and file name. """ # Multiversion project. - if not canonical_project.single_version: + if not parent_project.single_version: response = self._match_multiversion_project( - canonical_project=canonical_project, + parent_project=parent_project, path=path, ) if response: @@ -201,16 +201,16 @@ def _unresolve_path(self, canonical_project, path, check_subprojects=True): # Subprojects. if check_subprojects: response = self._match_subproject( - canonical_project=canonical_project, + parent_project=parent_project, path=path, ) if response: return response # Single version project. - if canonical_project.single_version: + if parent_project.single_version: response = self._match_single_version_project( - canonical_project=canonical_project, + parent_project=parent_project, path=path, ) if response: diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index e38342c35ed..2a7421dc90c 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -21,7 +21,7 @@ def test_unresolver(self): parts = unresolve( "https://pip.readthedocs.io/en/latest/foo.html?search=api#fragment" ) - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/foo.html") @@ -30,28 +30,28 @@ def test_unresolver(self): def test_filename_wihtout_index(self): parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False) - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/file/") def test_no_trailing_slash(self): parts = unresolve("https://pip.readthedocs.io/en/latest") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/index.html") def test_trailing_slash(self): parts = unresolve("https://pip.readthedocs.io/en/latest/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/index.html") def test_file_with_trailing_slash(self): parts = unresolve("https://pip.readthedocs.io/en/latest/foo/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/foo/index.html") @@ -63,14 +63,14 @@ def test_path_no_version(self): ] for url in urls: parts = unresolve(url) - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, None) self.assertEqual(parts.filename, None) def test_unresolver_subproject(self): parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/latest/foo.html") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) self.assertEqual(parts.filename, "/foo.html") @@ -84,7 +84,7 @@ def test_unresolve_subproject_with_translation(self): ) version = subproject_translation.versions.first() parts = unresolve("https://pip.readthedocs.io/projects/sub/en/latest/foo.html") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, subproject_translation) self.assertEqual(parts.version, version) self.assertEqual(parts.filename, "/foo.html") @@ -93,7 +93,7 @@ def test_unresolve_subproject_single_version(self): self.subproject.single_version = True self.subproject.save() parts = unresolve("https://pip.readthedocs.io/projects/sub/foo.html") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) self.assertEqual(parts.filename, "/foo.html") @@ -103,21 +103,21 @@ def test_unresolve_subproject_alias(self): relation.alias = "sub_alias" relation.save() parts = unresolve("https://pip.readthedocs.io/projects/sub_alias/ja/latest/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) self.assertEqual(parts.filename, "/index.html") def test_unresolver_translation(self): parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.translation) self.assertEqual(parts.version, self.translation_version) self.assertEqual(parts.filename, "/foo.html") def test_unresolve_no_existing_translation(self): parts = unresolve("https://pip.readthedocs.io/es/latest/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, None) self.assertEqual(parts.filename, None) @@ -130,7 +130,7 @@ def test_unresolver_custom_domain(self): canonical=True, ) parts = unresolve("https://docs.foobar.com/en/latest/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/index.html") @@ -139,7 +139,7 @@ def test_unresolve_single_version(self): self.pip.single_version = True self.pip.save() parts = unresolve("https://pip.readthedocs.io/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/index.html") @@ -148,7 +148,7 @@ def test_unresolve_single_version_translation_like_path(self): self.pip.single_version = True self.pip.save() parts = unresolve("https://pip.readthedocs.io/en/latest/index.html") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/en/latest/index.html") @@ -159,7 +159,7 @@ def test_unresolve_single_version_subproject_like_path(self): parts = unresolve( "https://pip.readthedocs.io/projects/other/en/latest/index.html" ) - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/projects/other/en/latest/index.html") @@ -170,7 +170,7 @@ def test_unresolve_single_version_subproject(self): parts = unresolve( "https://pip.readthedocs.io/projects/sub/ja/latest/index.html" ) - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.subproject) self.assertEqual(parts.version, self.subproject_version) self.assertEqual(parts.filename, "/index.html") @@ -184,14 +184,14 @@ def test_unresolver_external_version(self): active=True, ) parts = unresolve("https://pip--10.dev.readthedocs.build/en/10/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, version) self.assertEqual(parts.filename, "/index.html") def test_unresolve_external_version_no_existing_version(self): parts = unresolve("https://pip--10.dev.readthedocs.build/en/10/") - self.assertEqual(parts.canonical_project, self.pip) + self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, None) self.assertEqual(parts.filename, None) From cfe16b71289531afda4fbaa45b1dec705c69c39c Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 18 Aug 2022 13:06:55 -0500 Subject: [PATCH 14/21] Lint --- readthedocs/core/unresolver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index d067b78ab84..2a9b623b5da 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -252,8 +252,8 @@ def unresolve_domain(self, domain): log.info("Public domain.", domain=domain) return project_slug, None, False - # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example, - # but these might be phishing, so let's ignore them for now. + # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) + # for example, but these might be phishing, so let's ignore them for now. log.warning("Weird variation of our domain.", domain=domain) return None, None, False @@ -280,6 +280,5 @@ def unresolve_domain(self, domain): return None, None, None - unresolver = Unresolver() unresolve = unresolver.unresolve From 690072487a282ba047a28939b0f644db6c314ef0 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 18 Aug 2022 15:50:24 -0500 Subject: [PATCH 15/21] Fix --- readthedocs/embed/v3/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py index 6e05cf9e603..db5792f53e3 100644 --- a/readthedocs/embed/v3/views.py +++ b/readthedocs/embed/v3/views.py @@ -107,7 +107,7 @@ def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion): page_content = self._download_page_content(url) else: project = self.unresolved_url.project - version_slug = self.unresolved_url.version_slug + version_slug = self.unresolved_url.version.slug filename = self.unresolved_url.filename page_content = self._get_page_content_from_storage(project, version_slug, filename) From 1e3548e4a81dd397e7a54a7ae419d91d20a13e05 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 22 Aug 2022 18:09:55 -0500 Subject: [PATCH 16/21] Unresolver: strict validation for external versions and other fixes - At first I was thinking to add this validation outside the unresolver, but I think it makes sense to be in the unresolver. - This also has a fix for partial subproject matches. - Instead of returning the query and the fragment separately, just return the parsed URL, I think it can useful to have the full path without having to call urllib.parse again. - Just return the extracted external version from unresolve_domain, so we don't have to parse the subdomain again. --- readthedocs/core/unresolver.py | 63 +++++++++++++------ readthedocs/embed/views.py | 2 +- readthedocs/proxito/middleware.py | 10 +-- .../rtd_tests/tests/test_unresolver.py | 36 ++++++++++- 4 files changed, 85 insertions(+), 26 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 2a9b623b5da..d9f5f33663e 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -1,6 +1,6 @@ import re from dataclasses import dataclass -from urllib.parse import urlparse +from urllib.parse import ParseResult, urlparse import structlog from django.conf import settings @@ -27,8 +27,7 @@ class UnresolvedURL: version: Version = None filename: str = None - query: str = None - fragment: str = None + parsed_url: ParseResult = None domain: Domain = None external: bool = False @@ -69,7 +68,9 @@ def unresolve(self, url, add_index=True): """ parsed = urlparse(url) domain = self.get_domain_from_host(parsed.netloc) - project_slug, domain_object, external = self.unresolve_domain(domain) + project_slug, domain_object, external_version_slug = self.unresolve_domain( + domain + ) if not project_slug: return None @@ -82,6 +83,26 @@ def unresolve(self, url, add_index=True): path=parsed.path, ) + # Make sure we are serving the external version from the subdomain. + if external_version_slug and version: + if external_version_slug != version.slug: + log.warning( + "Invalid version for external domain.", + domain=domain, + version_slug=version.slug, + ) + version = None + filename = None + elif not version.is_external: + log.warning( + "Version is not external.", + domain=domain, + version_slug=version.slug, + version_type=version.type, + ) + version = None + filename = None + if add_index and filename and filename.endswith("/"): filename += "index.html" @@ -90,10 +111,9 @@ def unresolve(self, url, add_index=True): project=project or parent_project, version=version, filename=filename, - query=parsed.query, - fragment=parsed.fragment, + parsed_url=parsed, domain=domain_object, - external=external, + external=bool(external_version_slug), ) @staticmethod @@ -109,7 +129,7 @@ def _match_multiversion_project(self, parent_project, path): Try to match a multiversion project. If the translation exists, we return a result even if the version doesn't, - so the translation is taken as the canonical project (useful for 404 pages). + so the translation is taken as the current project (useful for 404 pages). """ match = self.multiversion_pattern.match(path) if not match: @@ -138,6 +158,9 @@ def _match_subproject(self, parent_project, path): If the subproject exists, we try to resolve the rest of the path with the subproject as the canonical project. + + If the subproject exists, we return a result even if version doesn't, + so the subproject is taken as the current project (useful for 404 pages). """ match = self.subproject_pattern.match(path) if not match: @@ -151,11 +174,15 @@ def _match_subproject(self, parent_project, path): .first() ) if project_relationship: - return self._unresolve_path( - parent_project=project_relationship.child, + subproject = project_relationship.child + response = self._unresolve_path( + parent_project=subproject, path=file, check_subprojects=False, ) + if response: + return response + return subproject, None, None return None def _match_single_version_project(self, parent_project, path): @@ -216,7 +243,7 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True): if response: return response - return None, None, None + return parent_project, None, None @staticmethod def get_domain_from_host(host): @@ -234,8 +261,8 @@ def unresolve_domain(self, domain): Unresolve domain by extracting relevant information from it. :param str domain: Domain to extract the information from. - :returns: A tuple with: the project slug, domain object, and if the domain - is from an external version. + :returns: A tuple with: the project slug, domain object, and the + external version slug if the domain is from an external version. """ public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN) external_domain = self.get_domain_from_host( @@ -250,22 +277,22 @@ def unresolve_domain(self, domain): if public_domain == root_domain: project_slug = subdomain log.info("Public domain.", domain=domain) - return project_slug, None, False + return project_slug, None, None # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) # for example, but these might be phishing, so let's ignore them for now. log.warning("Weird variation of our domain.", domain=domain) - return None, None, False + return None, None, None # Serve PR builds on external_domain host. if external_domain == root_domain: try: + project_slug, version_slug = subdomain.rsplit("--", maxsplit=1) log.info("External versions domain.", domain=domain) - project_slug, _ = subdomain.rsplit("--", maxsplit=1) - return project_slug, None, True + return project_slug, None, version_slug except ValueError: log.info("Invalid format of external versions domain.", domain=domain) - return None, None, False + return None, None, None # Custom domain. domain_object = ( diff --git a/readthedocs/embed/views.py b/readthedocs/embed/views.py index 52e79fa79ad..5580c1e5531 100644 --- a/readthedocs/embed/views.py +++ b/readthedocs/embed/views.py @@ -81,7 +81,7 @@ def get(self, request): if url: unresolved = self.unresolved_url path = unresolved.filename - section = unresolved.fragment + section = unresolved.parsed_url.fragment elif not path and not doc: return Response( { diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index 05e62839f80..a761e7bbbc2 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -53,7 +53,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug) return project_slug - project_slug, domain_object, external = unresolver.unresolve_domain(host) + project_slug, domain_object, external_version_slug = unresolver.unresolve_domain( + host + ) if not project_slug: # Block domains that look like ours, may be phishing. if external_domain in host or public_domain in host: @@ -85,11 +87,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem return project_slug # Pull request previews. - if external: - subdomain, _ = host.split(".", maxsplit=1) - _, version_slug = subdomain.rsplit("--", maxsplit=1) + if external_version_slug: request.external_domain = True - request.host_version_slug = version_slug + request.host_version_slug = external_version_slug log.debug("Proxito External Version Domain.", host=host) return project_slug diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index 2a7421dc90c..63d7d182e75 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -25,8 +25,8 @@ def test_unresolver(self): self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) self.assertEqual(parts.filename, "/foo.html") - self.assertEqual(parts.fragment, "fragment") - self.assertEqual(parts.query, "search=api") + self.assertEqual(parts.parsed_url.fragment, "fragment") + self.assertEqual(parts.parsed_url.query, "search=api") def test_filename_wihtout_index(self): parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False) @@ -108,6 +108,20 @@ def test_unresolve_subproject_alias(self): self.assertEqual(parts.version, self.subproject_version) self.assertEqual(parts.filename, "/index.html") + def test_unresolve_subproject_invalid_version(self): + parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/nothing/foo.html") + self.assertEqual(parts.parent_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) + + def test_unresolve_subproject_invalid_translation(self): + parts = unresolve("https://pip.readthedocs.io/projects/sub/es/latest/foo.html") + self.assertEqual(parts.parent_project, self.pip) + self.assertEqual(parts.project, self.subproject) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) + def test_unresolver_translation(self): parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html") self.assertEqual(parts.parent_project, self.pip) @@ -196,6 +210,24 @@ def test_unresolve_external_version_no_existing_version(self): self.assertEqual(parts.version, None) self.assertEqual(parts.filename, None) + def test_external_version_not_matching_final_version(self): + parts = unresolve("https://pip--10.dev.readthedocs.build/en/latest/") + self.assertEqual(parts.parent_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) + + def test_normal_version_in_external_version_subdomain(self): + parts = unresolve("https://pip--latest.dev.readthedocs.build/en/latest/") + self.assertEqual(parts.parent_project, self.pip) + self.assertEqual(parts.project, self.pip) + self.assertEqual(parts.version, None) + self.assertEqual(parts.filename, None) + + def test_malformed_external_version(self): + parts = unresolve("https://pip-latest.dev.readthedocs.build/en/latest/") + self.assertEqual(parts, None) + def test_unresolver_unknown_host(self): parts = unresolve("https://random.stuff.com/en/latest/") self.assertEqual(parts, None) From 8563286690af199924c8ebd983758d7d8c789d33 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 23 Aug 2022 16:47:54 -0500 Subject: [PATCH 17/21] Updates from review --- readthedocs/core/unresolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index d9f5f33663e..0724be7c5b5 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -180,6 +180,9 @@ def _match_subproject(self, parent_project, path): path=file, check_subprojects=False, ) + # If we got a valid response, return that, + # otherwise return the current subproject + # as the current project without a valid version or path. if response: return response return subproject, None, None From 8fad1462cbc8aa6a6107e1a375e56ed2ff80be44 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 13 Sep 2022 16:41:20 -0500 Subject: [PATCH 18/21] Updates from review --- readthedocs/core/unresolver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 86c841d6cad..df742dc4336 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -95,10 +95,11 @@ def unresolve(self, url, add_index=True): filename = None elif not version.is_external: log.warning( - "Version is not external.", + "Attempt of serving a non-external version from RTD_EXTERNAL_VERSION_DOMAIN.", domain=domain, version_slug=version.slug, version_type=version.type, + url=url, ) version = None filename = None @@ -174,6 +175,8 @@ def _match_subproject(self, parent_project, path): .first() ) if project_relationship: + # We use the subproject as the new parent project + # to resolve the rest of the path relative to it. subproject = project_relationship.child response = self._unresolve_path( parent_project=subproject, @@ -279,7 +282,7 @@ def unresolve_domain(self, domain): # Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`. if public_domain == root_domain: project_slug = subdomain - log.info("Public domain.", domain=domain) + log.debug("Public domain.", domain=domain) return project_slug, None, None # TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) @@ -291,7 +294,7 @@ def unresolve_domain(self, domain): if external_domain == root_domain: try: project_slug, version_slug = subdomain.rsplit("--", maxsplit=1) - log.info("External versions domain.", domain=domain) + log.debug("External versions domain.", domain=domain) return project_slug, None, version_slug except ValueError: log.info("Invalid format of external versions domain.", domain=domain) From 4d4106929bb5f5b917f10cf713d8f38c6f09d4e5 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 14 Sep 2022 13:53:03 -0500 Subject: [PATCH 19/21] Updates from review from https://github.com/readthedocs/readthedocs.org/pull/9500 --- readthedocs/core/unresolver.py | 50 ++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index df742dc4336..d4fc57eb12e 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -40,9 +40,11 @@ class Unresolver: # - /en/latest/ # - /en/latest/file/name/ multiversion_pattern = re.compile( - r"^/(?P{lang_slug})(/((?P{version_slug})(/(?P{filename_slug}))?)?)?$".format( # noqa - **pattern_opts - ) + r""" + ^/(?P{lang_slug}) # Must have the language slug. + (/((?P{version_slug})(/(?P{filename_slug}))?)?)?$ # Optionally a version followed by a file. # noqa + """.format(**pattern_opts), + re.VERBOSE, ) # This pattern matches: @@ -50,9 +52,12 @@ class Unresolver: # - /projects/subproject/ # - /projects/subproject/file/name/ subproject_pattern = re.compile( - r"^/projects/(?P{project_slug}+)(/(?P{filename_slug}))?$".format( - **pattern_opts - ) + r""" + ^/projects/ # Must have the `projects` prefix. + (?P{project_slug}+) # Followed by the subproject alias. + (/(?P{filename_slug}))?$ # Optionally a filename, which will be recursively resolved. + """.format(**pattern_opts), + re.VERBOSE, ) def unresolve(self, url, add_index=True): @@ -68,17 +73,17 @@ def unresolve(self, url, add_index=True): """ parsed = urlparse(url) domain = self.get_domain_from_host(parsed.netloc) - project_slug, domain_object, external_version_slug = self.unresolve_domain( + parent_project_slug, domain_object, external_version_slug = self.unresolve_domain( domain ) - if not project_slug: + if not parent_project_slug: return None - parent_project = Project.objects.filter(slug=project_slug).first() + parent_project = Project.objects.filter(slug=parent_project_slug).first() if not parent_project: return None - project, version, filename = self._unresolve_path( + current_project, version, filename = self._unresolve_path( parent_project=parent_project, path=parsed.path, ) @@ -109,7 +114,7 @@ def unresolve(self, url, add_index=True): return UnresolvedURL( parent_project=parent_project, - project=project or parent_project, + project=current_project or parent_project, version=version, filename=filename, parsed_url=parsed, @@ -131,6 +136,10 @@ def _match_multiversion_project(self, parent_project, path): If the translation exists, we return a result even if the version doesn't, so the translation is taken as the current project (useful for 404 pages). + + :returns: None or a tuple with the current project, version and file. + A tuple with only the project means we weren't able to find a version, + but the translation was correct. """ match = self.multiversion_pattern.match(path) if not match: @@ -162,15 +171,19 @@ def _match_subproject(self, parent_project, path): If the subproject exists, we return a result even if version doesn't, so the subproject is taken as the current project (useful for 404 pages). + + :returns: None or a tuple with the current project, version and file. + A tuple with only the project means we were able to find the subproject, + but we weren't able to resolve the rest of the path. """ match = self.subproject_pattern.match(path) if not match: return None - project_slug = match.group("project") + subproject_alias = match.group("project") file = self._normalize_filename(match.group("file")) project_relationship = ( - parent_project.subprojects.filter(alias=project_slug) + parent_project.subprojects.filter(alias=subproject_alias) .prefetch_related("child") .first() ) @@ -215,10 +228,19 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True): If the returned version is `None`, then we weren't able to unresolve the path into a valid version of the project. + The checks are done in the following order: + + - Check for multiple versions if the parent project + isn't a single version project. + - Check for subprojects. + - Check for single versions if the parent project isn’t + a multi version project. + :param parent_project: The project that owns the path. :param path: The path to unresolve. :param check_subprojects: If we should check for subprojects, - this is used to call this function recursively. + this is used to call this function recursively when + resolving the path from a subproject (we don't support subprojects of subprojects). :returns: A tuple with: project, version, and file name. """ From f2d3fdfe7469e93d846d631863ea0ebc7ccf1cc1 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 14 Sep 2022 13:57:30 -0500 Subject: [PATCH 20/21] Rename --- readthedocs/core/unresolver.py | 6 +++--- readthedocs/rtd_tests/tests/test_unresolver.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index d4fc57eb12e..9ff49b06e75 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -60,7 +60,7 @@ class Unresolver: re.VERBOSE, ) - def unresolve(self, url, add_index=True): + def unresolve(self, url, append_indexhtml=True): """ Turn a URL into the component parts that our views would use to process them. @@ -68,7 +68,7 @@ def unresolve(self, url, add_index=True): like where we want to figure out exactly what file a URL maps to. :param url: Full URL to unresolve (including the protocol and domain part). - :param add_index: If `True` the filename will be normalized + :param append_indexhtml: If `True` directories will be normalized to end with ``/index.html``. """ parsed = urlparse(url) @@ -109,7 +109,7 @@ def unresolve(self, url, add_index=True): version = None filename = None - if add_index and filename and filename.endswith("/"): + if append_indexhtml and filename and filename.endswith("/"): filename += "index.html" return UnresolvedURL( diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index 63d7d182e75..063d3620f8e 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -29,7 +29,7 @@ def test_unresolver(self): self.assertEqual(parts.parsed_url.query, "search=api") def test_filename_wihtout_index(self): - parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False) + parts = unresolve("https://pip.readthedocs.io/en/latest/file/", append_indexhtml=False) self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version) From 80574f3894e0df7dbc5ce82dfd7e9f9e2adb7ce1 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 14 Sep 2022 14:01:59 -0500 Subject: [PATCH 21/21] Black --- readthedocs/core/unresolver.py | 16 +++++++++++----- readthedocs/rtd_tests/tests/test_unresolver.py | 4 +++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/readthedocs/core/unresolver.py b/readthedocs/core/unresolver.py index 9ff49b06e75..26f80d6bd29 100644 --- a/readthedocs/core/unresolver.py +++ b/readthedocs/core/unresolver.py @@ -43,7 +43,9 @@ class Unresolver: r""" ^/(?P{lang_slug}) # Must have the language slug. (/((?P{version_slug})(/(?P{filename_slug}))?)?)?$ # Optionally a version followed by a file. # noqa - """.format(**pattern_opts), + """.format( + **pattern_opts + ), re.VERBOSE, ) @@ -56,7 +58,9 @@ class Unresolver: ^/projects/ # Must have the `projects` prefix. (?P{project_slug}+) # Followed by the subproject alias. (/(?P{filename_slug}))?$ # Optionally a filename, which will be recursively resolved. - """.format(**pattern_opts), + """.format( + **pattern_opts + ), re.VERBOSE, ) @@ -73,9 +77,11 @@ def unresolve(self, url, append_indexhtml=True): """ parsed = urlparse(url) domain = self.get_domain_from_host(parsed.netloc) - parent_project_slug, domain_object, external_version_slug = self.unresolve_domain( - domain - ) + ( + parent_project_slug, + domain_object, + external_version_slug, + ) = self.unresolve_domain(domain) if not parent_project_slug: return None diff --git a/readthedocs/rtd_tests/tests/test_unresolver.py b/readthedocs/rtd_tests/tests/test_unresolver.py index 063d3620f8e..06b834226c3 100644 --- a/readthedocs/rtd_tests/tests/test_unresolver.py +++ b/readthedocs/rtd_tests/tests/test_unresolver.py @@ -29,7 +29,9 @@ def test_unresolver(self): self.assertEqual(parts.parsed_url.query, "search=api") def test_filename_wihtout_index(self): - parts = unresolve("https://pip.readthedocs.io/en/latest/file/", append_indexhtml=False) + parts = unresolve( + "https://pip.readthedocs.io/en/latest/file/", append_indexhtml=False + ) self.assertEqual(parts.parent_project, self.pip) self.assertEqual(parts.project, self.pip) self.assertEqual(parts.version, self.version)