diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index 94c3da021ca..fd3c2a902c5 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -164,3 +164,11 @@ "html", "json", ) +# Artifacts that expect one and only one file in the output directory. +# NOTE: currently, this is a limitation that we are considering removing +# https://github.com/readthedocs/readthedocs.org/issues/9931#issuecomment-1403415757 +ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT = ( + "htmlzip", + "epub", + "pdf", +) diff --git a/readthedocs/doc_builder/exceptions.py b/readthedocs/doc_builder/exceptions.py index d3730baa617..b48df350f97 100644 --- a/readthedocs/doc_builder/exceptions.py +++ b/readthedocs/doc_builder/exceptions.py @@ -40,6 +40,18 @@ class BuildUserError(BuildBaseException): BUILD_COMMANDS_WITHOUT_OUTPUT = gettext_noop( f'No "{BUILD_COMMANDS_OUTPUT_PATH_HTML}" folder was created during this build.' ) + BUILD_OUTPUT_IS_NOT_A_DIRECTORY = gettext_noop( + 'Build output directory for format "{artifact_type}" is not a directory.' + ) + BUILD_OUTPUT_HAS_0_FILES = gettext_noop( + 'Build output directory for format "{artifact_type}" does not contain any files. ' + "It seems the build process created the directory but did not save any file to it." + ) + BUILD_OUTPUT_HAS_MULTIPLE_FILES = gettext_noop( + 'Build output directory for format "{artifact_type}" contains multiple files ' + "and it is not currently supported. " + 'Please remove all the files but the "{artifact_type}" you want to upload.' 
+ ) class BuildUserSkip(BuildUserError): diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index ed4707f16ae..071a1bf1683 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -19,6 +19,7 @@ from readthedocs.builds import tasks as build_tasks from readthedocs.builds.constants import ( ARTIFACT_TYPES, + ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT, BUILD_FINAL_STATES, BUILD_STATE_BUILDING, BUILD_STATE_CLONING, @@ -494,7 +495,6 @@ def on_failure(self, exc, task_id, args, kwargs, einfo): if self.data.version: version_type = self.data.version.type - # NOTE: autoflake gets confused here. We need the NOQA for now. status = BUILD_STATUS_FAILURE if isinstance(exc, BuildUserSkip): # The build was skipped by returning the magic exit code, @@ -513,55 +513,91 @@ def on_failure(self, exc, task_id, args, kwargs, einfo): # Update build object self.data.build['success'] = False - def on_success(self, retval, task_id, args, kwargs): - time_before_store_build_artifacts = timezone.now() - # Store build artifacts to storage (local or cloud storage) - # - # TODO: move ``store_build_artifacts`` inside ``execute`` so we can - # handle exceptions properly on ``on_failure`` - self.store_build_artifacts() - log.info( - "Store build artifacts finished.", - time=(timezone.now() - time_before_store_build_artifacts).seconds, - ) + def get_valid_artifact_types(self): + """ + Return a list of all the valid artifact types for this build. - # HTML are built successfully. 
- html = os.path.exists( - self.data.project.artifact_path( - version=self.data.version.slug, type_="html" - ) - ) - localmedia = os.path.exists( - self.data.project.artifact_path( - version=self.data.version.slug, type_="htmlzip" - ) - ) - pdf = os.path.exists( - self.data.project.artifact_path(version=self.data.version.slug, type_="pdf") - ) - epub = os.path.exists( - self.data.project.artifact_path( - version=self.data.version.slug, type_="epub" + It performs the following checks on each output format type path: + - it exists + - it is a directory + - it does not contain more than 1 file (only PDF, HTMLZip, ePUB) + + TODO: remove the limitation of only 1 file. + Add support for multiple PDF files in the output directory and + grab them by using glob syntax among other files that could be garbage. + """ + valid_artifacts = [] + for artifact_type in ARTIFACT_TYPES: + artifact_directory = self.data.project.artifact_path( + version=self.data.version.slug, + type_=artifact_type, ) - ) + if not os.path.exists(artifact_directory): + # There is no output directory. + # Skip this format. + continue + + if not os.path.isdir(artifact_directory): + log.error( + "The output path is not a directory.", + output_format=artifact_type, + ) + raise BuildUserError( + BuildUserError.BUILD_OUTPUT_IS_NOT_A_DIRECTORY.format( + artifact_type=artifact_type ) + ) + + # Check if there are multiple files in the artifact directories. + # These output formats do not support multiple files yet. + # In case multiple files are found, the build fails. + if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT: + artifact_format_files = len(os.listdir(artifact_directory)) + if artifact_format_files > 1: + log.error( + "Multiple files are not supported for this format. 
" + "Skipping this output format.", + output_format=artifact_type, + ) + raise BuildUserError( + BuildUserError.BUILD_OUTPUT_HAS_MULTIPLE_FILES.format( + artifact_type=artifact_type + ) + ) + elif artifact_format_files == 0: + raise BuildUserError( + BuildUserError.BUILD_OUTPUT_HAS_0_FILES.format( + artifact_type=artifact_type + ) + ) + + # If all the conditions were met, the artifact is valid + valid_artifacts.append(artifact_type) + + return valid_artifacts + + def on_success(self, retval, task_id, args, kwargs): + valid_artifacts = self.get_valid_artifact_types() # NOTE: we are updating the db version instance *only* when - if html: + # TODO: remove this condition and *always* update the DB Version instance + if "html" in valid_artifacts: try: api_v2.version(self.data.version.pk).patch( { "built": True, "documentation_type": self.data.version.documentation_type, - "has_pdf": pdf, - "has_epub": epub, - "has_htmlzip": localmedia, + "has_pdf": "pdf" in valid_artifacts, + "has_epub": "epub" in valid_artifacts, + "has_htmlzip": "htmlzip" in valid_artifacts, } ) except HttpClientError: # NOTE: I think we should fail the build if we cannot update # the version at this point. Otherwise, we will have inconsistent data log.exception( - 'Updating version failed, skipping file sync.', + "Updating version db object failed. " + 'Files are synced in the storage, but "Version" object is not updated', ) # Index search data @@ -658,10 +694,13 @@ def update_build(self, state=None): try: api_v2.build(self.data.build['id']).patch(self.data.build) except Exception: - # NOTE: I think we should fail the build if we cannot update it - # at this point otherwise, the data will be inconsistent and we - # may be serving "new docs" but saying the "build have failed" - log.exception('Unable to update build') + # NOTE: we are updating the "Build" object on each `state`. + # Only if the last update fails, there may be some inconsistency + # between the "Build" object in our db and the reality. 
+ # + # The `state` argument will help us to track this more and understand + # at what state our updates are failing and decide what to do. + log.exception("Error while updating the build object.", state=state) def execute(self): self.data.build_director = BuildDirector( @@ -709,6 +748,12 @@ def execute(self): finally: self.data.build_data = self.collect_build_data() + # At this point, the user's build already succeeded. + # However, we cannot use `.on_success()` because we still have to upload the artifacts; + # which could fail, and we want to detect that and handle it properly at `.on_failure()` + # Store build artifacts to storage (local or cloud storage) + self.store_build_artifacts() + def collect_build_data(self): """ Collect data from the current build. @@ -786,53 +831,30 @@ def store_build_artifacts(self): Remove build artifacts of types not included in this build (PDF, ePub, zip only). """ + time_before_store_build_artifacts = timezone.now() log.info('Writing build artifacts to media storage') - # NOTE: I don't remember why we removed this state from the Build - # object. 
I'm re-adding it because I think it's useful, but we can - # remove it if we want self.update_build(state=BUILD_STATE_UPLOADING) + valid_artifacts = self.get_valid_artifact_types() + log.bind(artifacts=valid_artifacts) + types_to_copy = [] types_to_delete = [] for artifact_type in ARTIFACT_TYPES: - if os.path.exists( - self.data.project.artifact_path( - version=self.data.version.slug, - type_=artifact_type, - ) - ): + if artifact_type in valid_artifacts: types_to_copy.append(artifact_type) # Never delete HTML nor JSON (search index) elif artifact_type not in UNDELETABLE_ARTIFACT_TYPES: types_to_delete.append(artifact_type) + # Upload formats for media_type in types_to_copy: - # NOTE: here is where we get the correct FROM path to upload - from_path = self.data.version.project.artifact_path( + from_path = self.data.project.artifact_path( version=self.data.version.slug, type_=media_type, ) - - # Check if there are multiple files on source directories. - # These output format does not support multiple files yet. - # In case multiple files are found, the upload for this format is not performed. - # - # TODO: we should fail the build for these cases and clearly communicate this. - # to the user. To do this, we need to call this method (``store_build_artifacts``) - # since the ``execute`` method. - # It will allow us to just `raise BuildUserError` and handle it at - # Celery `on_failure` handler. - if media_type in ("htmlzip", "epub", "pdf"): - if len(os.listdir(from_path)) > 1: - log.exception( - "Multiple files are not supported for this format. 
" - "Skipping this output format.", - output_format=media_type, - ) - continue - - to_path = self.data.version.project.get_storage_path( + to_path = self.data.project.get_storage_path( type_=media_type, version_slug=self.data.version.slug, include_file=False, @@ -841,15 +863,22 @@ def store_build_artifacts(self): try: build_media_storage.sync_directory(from_path, to_path) except Exception: - # Ideally this should just be an IOError - # but some storage backends unfortunately throw other errors + # NOTE: the exceptions reported so far are: + # - botocore.exceptions:HTTPClientError + # - botocore.exceptions:ClientError + # - readthedocs.doc_builder.exceptions:BuildCancelled log.exception( - 'Error copying to storage (not failing build)', + "Error copying to storage", media_type=media_type, from_path=from_path, to_path=to_path, ) + # Re-raise the exception to fail the build and handle it + # automatically at `on_failure`. + # It will clearly communicate the error to the user. + raise BuildAppError("Error uploading files to the storage.") + # Delete formats for media_type in types_to_delete: media_path = self.data.version.project.get_storage_path( type_=media_type, @@ -860,13 +889,21 @@ def store_build_artifacts(self): try: build_media_storage.delete_directory(media_path) except Exception: - # Ideally this should just be an IOError - # but some storage backends unfortunately throw other errors + # NOTE: I didn't find any log line for this case yet log.exception( - 'Error deleting from storage (not failing build)', + "Error deleting files from storage", media_type=media_type, media_path=media_path, ) + # Re-raise the exception to fail the build and handle it + # automatically at `on_failure`. + # It will clearly communicate the error to the user. 
+ raise BuildAppError("Error deleting files from storage.") + + log.info( + "Store build artifacts finished.", + time=(timezone.now() - time_before_store_build_artifacts).seconds, + ) def send_notifications(self, version_pk, build_pk, event): """Send notifications to all subscribers of `event`.""" diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index cffc3a6db72..f5795cc1894 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -1,4 +1,5 @@ import os +import pathlib from unittest import mock import django_dynamic_fixture as fixture @@ -200,8 +201,14 @@ def test_build_updates_documentation_type(self, load_yaml_config): # Create the artifact paths, so that `store_build_artifacts` # properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804 os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub")) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf")) + for f in ("epub", "pdf"): + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{f}", + ) + ).touch() self._trigger_update_docs_task() @@ -322,11 +329,14 @@ def test_successful_build( # Create the artifact paths, so it's detected by the builder os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html")) os.makedirs(self.project.artifact_path(version=self.version.slug, type_="json")) - os.makedirs( - self.project.artifact_path(version=self.version.slug, type_="htmlzip") - ) - os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub")) - 
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf")) + for f in ("htmlzip", "epub", "pdf"): + os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f)) + pathlib.Path( + os.path.join( + self.project.artifact_path(version=self.version.slug, type_=f), + f"{self.project.slug}.{f}", + ) + ).touch() self._trigger_update_docs_task()