diff --git a/src/sagemaker/algorithm.py b/src/sagemaker/algorithm.py index 300227bca4..a55635b1c3 100644 --- a/src/sagemaker/algorithm.py +++ b/src/sagemaker/algorithm.py @@ -174,7 +174,7 @@ def __init__( self.validate_train_spec() self.hyperparameter_definitions = self._parse_hyperparameters() - self.hyperparam_dict = {} + self._hyperparameters = {} if hyperparameters: self.set_hyperparameters(**hyperparameters) @@ -215,7 +215,7 @@ def set_hyperparameters(self, **kwargs): """Placeholder docstring""" for k, v in kwargs.items(): value = self._validate_and_cast_hyperparameter(k, v) - self.hyperparam_dict[k] = value + self._hyperparameters[k] = value self._validate_and_set_default_hyperparameters() @@ -225,7 +225,7 @@ def hyperparameters(self): The fit() method, that does the model training, calls this method to find the hyperparameters you specified. """ - return self.hyperparam_dict + return self._hyperparameters def training_image_uri(self): """Returns the docker image to use for training. @@ -464,10 +464,10 @@ def _validate_and_set_default_hyperparameters(self): # Check if all the required hyperparameters are set. If there is a default value # for one, set it. 
for name, definition in self.hyperparameter_definitions.items(): - if name not in self.hyperparam_dict: + if name not in self._hyperparameters: spec = definition["spec"] if "DefaultValue" in spec: - self.hyperparam_dict[name] = spec["DefaultValue"] + self._hyperparameters[name] = spec["DefaultValue"] elif "IsRequired" in spec and spec["IsRequired"]: raise ValueError("Required hyperparameter: %s is not set" % name) diff --git a/src/sagemaker/chainer/estimator.py b/src/sagemaker/chainer/estimator.py index b99cad911f..899ef62f63 100644 --- a/src/sagemaker/chainer/estimator.py +++ b/src/sagemaker/chainer/estimator.py @@ -15,7 +15,7 @@ import logging -from sagemaker.estimator import Framework +from sagemaker.estimator import Framework, EstimatorBase from sagemaker.fw_utils import ( framework_name_from_image, framework_version_from_tag, @@ -158,7 +158,9 @@ def hyperparameters(self): # remove unset keys. additional_hyperparameters = {k: v for k, v in additional_hyperparameters.items() if v} - hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters)) + hyperparameters.update( + EstimatorBase._json_encode_hyperparameters(additional_hyperparameters) + ) return hyperparameters def create_model( diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index ddf6f107ed..8a867e5935 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -16,6 +16,7 @@ import json import logging import os +from typing import Any, Dict import uuid from abc import ABCMeta, abstractmethod @@ -86,6 +87,15 @@ class EstimatorBase(with_metaclass(ABCMeta, object)): # pylint: disable=too-man instance. 
""" + LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled" + LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled" + LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled" + INSTANCE_TYPE = "sagemaker_instance_type" + MPI_NUM_PROCESSES_PER_HOST = "sagemaker_mpi_num_of_processes_per_host" + MPI_CUSTOM_MPI_OPTIONS = "sagemaker_mpi_custom_mpi_options" + SM_DDP_CUSTOM_MPI_OPTIONS = "sagemaker_distributed_dataparallel_custom_mpi_options" + CONTAINER_CODE_CHANNEL_SOURCEDIR_PATH = "/opt/ml/input/data/code/sourcedir.tar.gz" + def __init__( self, role, @@ -119,6 +129,13 @@ def __init__( disable_profiler=False, environment=None, max_retry_attempts=None, + source_dir=None, + git_config=None, + hyperparameters=None, + container_log_level=logging.INFO, + code_location=None, + entry_point=None, + dependencies=None, **kwargs, ): """Initialize an ``EstimatorBase`` instance. @@ -270,13 +287,133 @@ def __init__( will be disabled (default: ``False``). environment (dict[str, str]) : Environment variables to be set for use during training job (default: ``None``) - max_retry_attempts (int): The number of times to move a job to the STARTING status. + max_retry_attempts (int): The number of times to move a job to the STARTING status. You can specify between 1 and 30 attempts. If the value of attempts is greater than zero, the job is retried on InternalServerFailure the same number of attempts as the value. You can cap the total duration for your job by setting ``max_wait`` and ``max_run`` (default: ``None``) + source_dir (str): Path (absolute, relative or an S3 URI) to a directory + with any other training source code dependencies aside from the entry + point file (default: None). If ``source_dir`` is an S3 URI, it must + point to a tar.gz file. Structure within this directory are preserved + when training on Amazon SageMaker. If 'git_config' is provided, + 'source_dir' should be a relative location to a directory in the Git + repo. + + .. 
admonition:: Example + + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + and you need 'train.py' as entry point and 'test.py' as + training source code as well, you can assign + entry_point='train.py', source_dir='src'. + git_config (dict[str, str]): Git configurations used for cloning + files, including ``repo``, ``branch``, ``commit``, + ``2FA_enabled``, ``username``, ``password`` and ``token``. The + ``repo`` field is required. All other fields are optional. + ``repo`` specifies the Git repository where your training script + is stored. If you don't provide ``branch``, the default value + 'master' is used. If you don't provide ``commit``, the latest + commit in the specified branch is used. .. admonition:: Example + + The following config: + + >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git', + >>> 'branch': 'test-branch-git-config', + >>> 'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'} + + results in cloning the repo specified in 'repo', then + checking out the 'test-branch-git-config' branch, and then the specified + commit. + + ``2FA_enabled``, ``username``, ``password`` and ``token`` are + used for authentication. For GitHub (or other Git) accounts, set + ``2FA_enabled`` to 'True' if two-factor authentication is + enabled for the account, otherwise set it to 'False'. If you do + not provide a value for ``2FA_enabled``, a default value of + 'False' is used. CodeCommit does not support two-factor + authentication, so do not provide "2FA_enabled" with CodeCommit + repositories. + + For GitHub and other Git repos, when SSH URLs are provided, it + doesn't matter whether 2FA is enabled or disabled; you should + either have no passphrase for the SSH key pairs, or have the + ssh-agent configured so that you will not be prompted for SSH + passphrase when you do 'git clone' command with SSH URLs.
When + HTTPS URLs are provided: if 2FA is disabled, then either token + or username+password will be used for authentication if provided + (token prioritized); if 2FA is enabled, only token will be used + for authentication if provided. If required authentication info + is not provided, python SDK will try to use local credentials + storage to authenticate. If that fails either, an error message + will be thrown. + + For CodeCommit repos, 2FA is not supported, so '2FA_enabled' + should not be provided. There is no token in CodeCommit, so + 'token' should not be provided too. When 'repo' is an SSH URL, + the requirements are the same as GitHub-like repos. When 'repo' + is an HTTPS URL, username+password will be used for + authentication if they are provided; otherwise, python SDK will + try to use either CodeCommit credential helper or local + credential storage for authentication. + hyperparameters (dict): Dictionary containing the hyperparameters to + initialize this estimator with. (Default: None). + container_log_level (int): Log level to use within the container + (default: logging.INFO). Valid values are defined in the Python + logging module. + code_location (str): The S3 prefix URI where custom code will be + uploaded (default: None) - don't include a trailing slash since + a string prepended with a "/" is appended to ``code_location``. The code + file uploaded to S3 is 'code_location/job-name/source/sourcedir.tar.gz'. + If not specified, the default ``code location`` is s3://output_bucket/job-name/. + entry_point (str): Path (absolute or relative) to the local Python + source file which should be executed as the entry point to + training. (Default: None). If ``source_dir`` is specified, then ``entry_point`` + must point to a file located at the root of ``source_dir``. + If 'git_config' is provided, 'entry_point' should be + a relative location to the Python source file in the Git repo. 
+ + Example: + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + You can assign entry_point='src/train.py'. + dependencies (list[str]): A list of paths to directories (absolute + or relative) with any additional libraries that will be exported + to the container (default: []). The library folders will be + copied to SageMaker in the same folder where the entrypoint is + copied. If 'git_config' is provided, 'dependencies' should be a + list of relative locations to directories with any additional + libraries needed in the Git repo. + + .. admonition:: Example + + The following call + + >>> Estimator(entry_point='train.py', + ... dependencies=['my/libs/common', 'virtual-env']) + + results in the following inside the container: + + >>> $ ls + + >>> opt/ml/code + >>> |------ train.py + >>> |------ common + >>> |------ virtual-env + + This is not supported with "local code" in Local Mode. """ instance_count = renamed_kwargs( @@ -311,6 +448,14 @@ def __init__( self.model_channel_name = model_channel_name self.code_uri = None self.code_channel_name = "code" + self.source_dir = source_dir + self.git_config = git_config + self.container_log_level = container_log_level + self._hyperparameters = hyperparameters.copy() if hyperparameters else {} + self.code_location = code_location + self.entry_point = entry_point + self.dependencies = dependencies + self.uploaded_code = None if self.instance_type in ("local", "local_gpu"): if self.instance_type == "local_gpu" and self.instance_count > 1: @@ -437,6 +582,21 @@ def _get_or_create_name(self, name=None): self._ensure_base_job_name() return name_from_base(self.base_job_name) + @staticmethod + def _json_encode_hyperparameters(hyperparameters: Dict[str, Any]) -> Dict[str, Any]: + """Applies Json encoding for certain Hyperparameter types, returns hyperparameters. + + Args: + hyperparameters (dict): Dictionary of hyperparameters. 
+ """ + current_hyperparameters = hyperparameters + if current_hyperparameters is not None: + hyperparameters = { + str(k): (v if isinstance(v, (Parameter, Expression, Properties)) else json.dumps(v)) + for (k, v) in current_hyperparameters.items() + } + return hyperparameters + def _prepare_for_training(self, job_name=None): """Set any values in the estimator that need to be set before training. @@ -456,10 +616,105 @@ def _prepare_for_training(self, job_name=None): else: self.output_path = "s3://{}/".format(self.sagemaker_session.default_bucket()) + if self.git_config: + updated_paths = git_utils.git_clone_repo( + self.git_config, self.entry_point, self.source_dir, self.dependencies + ) + self.entry_point = updated_paths["entry_point"] + self.source_dir = updated_paths["source_dir"] + self.dependencies = updated_paths["dependencies"] + + if self.source_dir or self.entry_point or self.dependencies: + + # validate source dir will raise a ValueError if there is something wrong with + # the source directory. We are intentionally not handling it because this is a + # critical error. + if self.source_dir and not self.source_dir.lower().startswith("s3://"): + validate_source_dir(self.entry_point, self.source_dir) + + # if we are in local mode with local_code=True. We want the container to just + # mount the source dir instead of uploading to S3. + local_code = get_config_value("local.local_code", self.sagemaker_session.config) + + if self.sagemaker_session.local_mode and local_code: + # if there is no source dir, use the directory containing the entry point. 
+ if self.source_dir is None: + self.source_dir = os.path.dirname(self.entry_point) + self.entry_point = os.path.basename(self.entry_point) + + code_dir = "file://" + self.source_dir + script = self.entry_point + elif self.enable_network_isolation() and self.entry_point: + self.uploaded_code = self._stage_user_code_in_s3() + code_dir = self.CONTAINER_CODE_CHANNEL_SOURCEDIR_PATH + script = self.uploaded_code.script_name + self.code_uri = self.uploaded_code.s3_prefix + else: + self.uploaded_code = self._stage_user_code_in_s3() + code_dir = self.uploaded_code.s3_prefix + script = self.uploaded_code.script_name + + # Modify hyperparameters in-place to point to the right code directory and + # script URIs + self._script_mode_hyperparam_update(code_dir, script) + self._prepare_rules() self._prepare_debugger_for_training() self._prepare_profiler_for_training() + def _script_mode_hyperparam_update(self, code_dir: str, script: str) -> None: + """Applies in-place update to hyperparameters required for script mode with training. + + Args: + code_dir (str): The directory hosting the training scripts. + script (str): The relative filepath of the training entry-point script. + """ + hyperparams: Dict[str, Any] = {} + hyperparams[DIR_PARAM_NAME] = code_dir + hyperparams[SCRIPT_PARAM_NAME] = script + hyperparams[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level + hyperparams[JOB_NAME_PARAM_NAME] = self._current_job_name + hyperparams[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_region_name + + self._hyperparameters.update(EstimatorBase._json_encode_hyperparameters(hyperparams)) + + def _stage_user_code_in_s3(self) -> "UploadedCode": + """Upload the user training script to S3 and return the upload result. 
+ + Returns: the value returned by ``tar_and_upload_dir`` (``s3_prefix``, ``script_name``) + """ + local_mode = self.output_path.startswith("file://") + + if self.code_location is None and local_mode: + code_bucket = self.sagemaker_session.default_bucket() + code_s3_prefix = "{}/{}".format(self._current_job_name, "source") + kms_key = None + elif self.code_location is None: + code_bucket, _ = parse_s3_url(self.output_path) + code_s3_prefix = "{}/{}".format(self._current_job_name, "source") + kms_key = self.output_kms_key + elif local_mode: + code_bucket, key_prefix = parse_s3_url(self.code_location) + code_s3_prefix = "/".join(filter(None, [key_prefix, self._current_job_name, "source"])) + kms_key = None + else: + code_bucket, key_prefix = parse_s3_url(self.code_location) + code_s3_prefix = "/".join(filter(None, [key_prefix, self._current_job_name, "source"])) + output_bucket, _ = parse_s3_url(self.output_path) + kms_key = self.output_kms_key if code_bucket == output_bucket else None + + return tar_and_upload_dir( + session=self.sagemaker_session.boto_session, + bucket=code_bucket, + s3_key_prefix=code_s3_prefix, + script=self.entry_point, + directory=self.source_dir, + dependencies=self.dependencies, + kms_key=kms_key, + s3_resource=self.sagemaker_session.s3_resource, + settings=self.sagemaker_session.settings, + ) + def _prepare_rules(self): """Rules list includes both debugger and profiler rules. @@ -1719,6 +1974,12 @@ def __init__( disable_profiler=False, environment=None, max_retry_attempts=None, + source_dir=None, + git_config=None, + container_log_level=logging.INFO, + code_location=None, + entry_point=None, + dependencies=None, **kwargs, ): """Initialize an ``Estimator`` instance. @@ -1876,9 +2137,127 @@ def __init__( the same number of attempts as the value. You can cap the total duration for your job by setting ``max_wait`` and ``max_run`` (default: ``None``) + source_dir (str): Path (absolute, relative or an S3 URI) to a directory + with any other training source code dependencies aside from the entry + point file (default: None).
If ``source_dir`` is an S3 URI, it must + point to a tar.gz file. Structure within this directory are preserved + when training on Amazon SageMaker. If 'git_config' is provided, + 'source_dir' should be a relative location to a directory in the Git + repo. + + .. admonition:: Example + + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + and you need 'train.py' as entry point and 'test.py' as + training source code as well, you can assign + entry_point='train.py', source_dir='src'. + git_config (dict[str, str]): Git configurations used for cloning + files, including ``repo``, ``branch``, ``commit``, + ``2FA_enabled``, ``username``, ``password`` and ``token``. The + ``repo`` field is required. All other fields are optional. + ``repo`` specifies the Git repository where your training script + is stored. If you don't provide ``branch``, the default value + 'master' is used. If you don't provide ``commit``, the latest + commit in the specified branch is used. .. admonition:: Example + + The following config: + + >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git', + >>> 'branch': 'test-branch-git-config', + >>> 'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'} + + results in cloning the repo specified in 'repo', then + checking out the 'test-branch-git-config' branch, and then the specified + commit. + + ``2FA_enabled``, ``username``, ``password`` and ``token`` are + used for authentication. For GitHub (or other Git) accounts, set + ``2FA_enabled`` to 'True' if two-factor authentication is + enabled for the account, otherwise set it to 'False'. If you do + not provide a value for ``2FA_enabled``, a default value of + 'False' is used. CodeCommit does not support two-factor + authentication, so do not provide "2FA_enabled" with CodeCommit + repositories.
+ + For GitHub and other Git repos, when SSH URLs are provided, it + doesn't matter whether 2FA is enabled or disabled; you should + either have no passphrase for the SSH key pairs, or have the + ssh-agent configured so that you will not be prompted for SSH + passphrase when you do 'git clone' command with SSH URLs. When + HTTPS URLs are provided: if 2FA is disabled, then either token + or username+password will be used for authentication if provided + (token prioritized); if 2FA is enabled, only token will be used + for authentication if provided. If required authentication info + is not provided, python SDK will try to use local credentials + storage to authenticate. If that fails either, an error message + will be thrown. + + For CodeCommit repos, 2FA is not supported, so '2FA_enabled' + should not be provided. There is no token in CodeCommit, so + 'token' should not be provided too. When 'repo' is an SSH URL, + the requirements are the same as GitHub-like repos. When 'repo' + is an HTTPS URL, username+password will be used for + authentication if they are provided; otherwise, python SDK will + try to use either CodeCommit credential helper or local + credential storage for authentication. + container_log_level (int): Log level to use within the container + (default: logging.INFO). Valid values are defined in the Python + logging module. + code_location (str): The S3 prefix URI where custom code will be + uploaded (default: None) - don't include a trailing slash since + a string prepended with a "/" is appended to ``code_location``. The code + file uploaded to S3 is 'code_location/job-name/source/sourcedir.tar.gz'. + If not specified, the default ``code location`` is s3://output_bucket/job-name/. + entry_point (str): Path (absolute or relative) to the local Python + source file which should be executed as the entry point to + training. If ``source_dir`` is specified, then ``entry_point`` + must point to a file located at the root of ``source_dir``. 
+ If 'git_config' is provided, 'entry_point' should be + a relative location to the Python source file in the Git repo. + + Example: + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + You can assign entry_point='src/train.py'. + dependencies (list[str]): A list of paths to directories (absolute + or relative) with any additional libraries that will be exported + to the container (default: []). The library folders will be + copied to SageMaker in the same folder where the entrypoint is + copied. If 'git_config' is provided, 'dependencies' should be a + list of relative locations to directories with any additional + libraries needed in the Git repo. + + .. admonition:: Example + + The following call + + >>> Estimator(entry_point='train.py', + ... dependencies=['my/libs/common', 'virtual-env']) + + results in the following inside the container: + + >>> $ ls + + >>> opt/ml/code + >>> |------ train.py + >>> |------ common + >>> |------ virtual-env + + This is not supported with "local code" in Local Mode. """ self.image_uri = image_uri - self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {} + self._hyperparameters = hyperparameters.copy() if hyperparameters else {} super(Estimator, self).__init__( role, instance_count, @@ -1911,6 +2290,13 @@ def __init__( disable_profiler=disable_profiler, environment=environment, max_retry_attempts=max_retry_attempts, + container_log_level=container_log_level, + source_dir=source_dir, + git_config=git_config, + code_location=code_location, + entry_point=entry_point, + dependencies=dependencies, + hyperparameters=hyperparameters, **kwargs, ) @@ -1931,7 +2317,7 @@ def set_hyperparameters(self, **kwargs): training. """ for k, v in kwargs.items(): - self.hyperparam_dict[k] = v + self._hyperparameters[k] = v def hyperparameters(self): """Returns the hyperparameters as a dictionary to use for training. 
@@ -1939,7 +2325,7 @@ def hyperparameters(self): The fit() method, that does the model training, calls this method to find the hyperparameters you specified. """ - return self.hyperparam_dict + return self._hyperparameters def create_model( self, @@ -2015,15 +2401,6 @@ class Framework(EstimatorBase): _framework_name = None - LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled" - LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled" - LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled" - INSTANCE_TYPE = "sagemaker_instance_type" - MPI_NUM_PROCESSES_PER_HOST = "sagemaker_mpi_num_of_processes_per_host" - MPI_CUSTOM_MPI_OPTIONS = "sagemaker_mpi_custom_mpi_options" - SM_DDP_CUSTOM_MPI_OPTIONS = "sagemaker_distributed_dataparallel_custom_mpi_options" - CONTAINER_CODE_CHANNEL_SOURCEDIR_PATH = "/opt/ml/input/data/code/sourcedir.tar.gz" - def __init__( self, entry_point, @@ -2237,48 +2614,23 @@ def _prepare_for_training(self, job_name=None): """ super(Framework, self)._prepare_for_training(job_name=job_name) - if self.git_config: - updated_paths = git_utils.git_clone_repo( - self.git_config, self.entry_point, self.source_dir, self.dependencies - ) - self.entry_point = updated_paths["entry_point"] - self.source_dir = updated_paths["source_dir"] - self.dependencies = updated_paths["dependencies"] + self._validate_and_set_debugger_configs() - # validate source dir will raise a ValueError if there is something wrong with the - # source directory. We are intentionally not handling it because this is a critical error. - if self.source_dir and not self.source_dir.lower().startswith("s3://"): - validate_source_dir(self.entry_point, self.source_dir) - - # if we are in local mode with local_code=True. We want the container to just - # mount the source dir instead of uploading to S3. 
- local_code = get_config_value("local.local_code", self.sagemaker_session.config) - if self.sagemaker_session.local_mode and local_code: - # if there is no source dir, use the directory containing the entry point. - if self.source_dir is None: - self.source_dir = os.path.dirname(self.entry_point) - self.entry_point = os.path.basename(self.entry_point) - - code_dir = "file://" + self.source_dir - script = self.entry_point - elif self.enable_network_isolation() and self.entry_point: - self.uploaded_code = self._stage_user_code_in_s3() - code_dir = self.CONTAINER_CODE_CHANNEL_SOURCEDIR_PATH - script = self.uploaded_code.script_name - self.code_uri = self.uploaded_code.s3_prefix - else: - self.uploaded_code = self._stage_user_code_in_s3() - code_dir = self.uploaded_code.s3_prefix - script = self.uploaded_code.script_name + def _script_mode_hyperparam_update(self, code_dir: str, script: str) -> None: + """Applies in-place update to hyperparameters required for script mode with training. - # Modify hyperparameters in-place to point to the right code directory and script URIs - self._hyperparameters[DIR_PARAM_NAME] = code_dir - self._hyperparameters[SCRIPT_PARAM_NAME] = script - self._hyperparameters[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level - self._hyperparameters[JOB_NAME_PARAM_NAME] = self._current_job_name - self._hyperparameters[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_region_name + Args: + code_dir (str): The directory hosting the training scripts. + script (str): The relative filepath of the training entry-point script. 
+ """ + hyperparams: Dict[str, str] = {} + hyperparams[DIR_PARAM_NAME] = code_dir + hyperparams[SCRIPT_PARAM_NAME] = script + hyperparams[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level + hyperparams[JOB_NAME_PARAM_NAME] = self._current_job_name + hyperparams[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_region_name - self._validate_and_set_debugger_configs() + self._hyperparameters.update(hyperparams) def _validate_and_set_debugger_configs(self): """Set defaults for debugging.""" @@ -2308,44 +2660,6 @@ def _validate_and_set_debugger_configs(self): self.environment = {} self.environment[DEBUGGER_FLAG] = "0" - def _stage_user_code_in_s3(self): - """Upload the user training script to s3 and return the location. - - Returns: s3 uri - """ - local_mode = self.output_path.startswith("file://") - - if self.code_location is None and local_mode: - code_bucket = self.sagemaker_session.default_bucket() - code_s3_prefix = "{}/{}".format(self._current_job_name, "source") - kms_key = None - elif self.code_location is None: - code_bucket, _ = parse_s3_url(self.output_path) - code_s3_prefix = "{}/{}".format(self._current_job_name, "source") - kms_key = self.output_kms_key - elif local_mode: - code_bucket, key_prefix = parse_s3_url(self.code_location) - code_s3_prefix = "/".join(filter(None, [key_prefix, self._current_job_name, "source"])) - kms_key = None - else: - code_bucket, key_prefix = parse_s3_url(self.code_location) - code_s3_prefix = "/".join(filter(None, [key_prefix, self._current_job_name, "source"])) - - output_bucket, _ = parse_s3_url(self.output_path) - kms_key = self.output_kms_key if code_bucket == output_bucket else None - - return tar_and_upload_dir( - session=self.sagemaker_session.boto_session, - bucket=code_bucket, - s3_key_prefix=code_s3_prefix, - script=self.entry_point, - directory=self.source_dir, - dependencies=self.dependencies, - kms_key=kms_key, - s3_resource=self.sagemaker_session.s3_resource, - 
settings=self.sagemaker_session.settings, - ) - def _model_source_dir(self): """Get the appropriate value to pass as ``source_dir`` to a model constructor. @@ -2376,6 +2690,10 @@ def _model_entry_point(self): return None + def set_hyperparameters(self, **kwargs): + """Store hyperparameters as given; ``hyperparameters()`` JSON-encodes them on read.""" + self._hyperparameters.update(kwargs) + def hyperparameters(self): """Return the hyperparameters as a dictionary to use for training. @@ -2385,7 +2703,7 @@ def hyperparameters(self): Returns: dict[str, str]: The hyperparameters. """ - return self._json_encode_hyperparameters(self._hyperparameters) + return EstimatorBase._json_encode_hyperparameters(self._hyperparameters) @classmethod def _prepare_init_params_from_job_description(cls, job_details, model_channel_name=None): @@ -2504,17 +2822,6 @@ def attach(cls, training_job_name, sagemaker_session=None, model_channel_name="m ) return estimator - @staticmethod - def _json_encode_hyperparameters(hyperparameters): - """Placeholder docstring""" - current_hyperparameters = hyperparameters - if current_hyperparameters is not None: - hyperparameters = { - str(k): (v if isinstance(v, (Parameter, Expression, Properties)) else json.dumps(v)) - for (k, v) in current_hyperparameters.items() - } - return hyperparameters - @classmethod def _update_init_params(cls, hp, tf_arguments): """Placeholder docstring""" diff --git a/src/sagemaker/huggingface/estimator.py b/src/sagemaker/huggingface/estimator.py index 03eb8f496a..9d154d7183 100644 --- a/src/sagemaker/huggingface/estimator.py +++ b/src/sagemaker/huggingface/estimator.py @@ -17,7 +17,7 @@ import re from sagemaker.deprecations import renamed_kwargs -from sagemaker.estimator import Framework +from sagemaker.estimator import Framework, EstimatorBase from sagemaker.fw_utils import ( framework_name_from_image, warn_if_parameter_server_with_multi_gpu, @@ -246,13 +246,13 @@ def
hyperparameters(self): distribution=self.distribution ) hyperparameters.update( - Framework._json_encode_hyperparameters(distributed_training_hyperparameters) + EstimatorBase._json_encode_hyperparameters(distributed_training_hyperparameters) ) if self.compiler_config: training_compiler_hyperparameters = self.compiler_config._to_hyperparameter_dict() hyperparameters.update( - Framework._json_encode_hyperparameters(training_compiler_hyperparameters) + EstimatorBase._json_encode_hyperparameters(training_compiler_hyperparameters) ) return hyperparameters diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index c78d786c75..1f822458db 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -399,12 +399,10 @@ def prepare_container_def( ) deploy_env = copy.deepcopy(self.env) if self.source_dir or self.dependencies or self.entry_point or self.git_config: - if self.key_prefix or self.git_config: - self._upload_code(deploy_key_prefix, repack=False) - elif self.source_dir and self.entry_point: - self._upload_code(deploy_key_prefix, repack=True) - else: - self._upload_code(deploy_key_prefix, repack=False) + is_repack = ( + self.source_dir and self.entry_point and not (self.key_prefix or self.git_config) + ) + self._upload_code(deploy_key_prefix, repack=is_repack) deploy_env.update(self._script_mode_env_vars()) return sagemaker.container_def( self.image_uri, self.model_data, deploy_env, image_config=self.image_config diff --git a/src/sagemaker/pytorch/estimator.py b/src/sagemaker/pytorch/estimator.py index 44d5cfeb98..5807d55365 100644 --- a/src/sagemaker/pytorch/estimator.py +++ b/src/sagemaker/pytorch/estimator.py @@ -18,7 +18,7 @@ from packaging.version import Version from sagemaker.deprecations import renamed_kwargs -from sagemaker.estimator import Framework +from sagemaker.estimator import Framework, EstimatorBase from sagemaker.fw_utils import ( framework_name_from_image, framework_version_from_tag, @@ -192,7 +192,9 @@ def hyperparameters(self): 
additional_hyperparameters = self._distribution_configuration( distribution=self.distribution ) - hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters)) + hyperparameters.update( + EstimatorBase._json_encode_hyperparameters(additional_hyperparameters) + ) return hyperparameters def create_model( diff --git a/src/sagemaker/rl/estimator.py b/src/sagemaker/rl/estimator.py index 09f2181516..60307a7868 100644 --- a/src/sagemaker/rl/estimator.py +++ b/src/sagemaker/rl/estimator.py @@ -18,7 +18,7 @@ import re from sagemaker import image_uris, fw_utils -from sagemaker.estimator import Framework +from sagemaker.estimator import Framework, EstimatorBase from sagemaker.model import FrameworkModel, SAGEMAKER_OUTPUT_LOCATION from sagemaker.mxnet.model import MXNetModel from sagemaker.tensorflow.model import TensorFlowModel @@ -340,7 +340,9 @@ def hyperparameters(self): SAGEMAKER_ESTIMATOR: SAGEMAKER_ESTIMATOR_VALUE, } - hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters)) + hyperparameters.update( + EstimatorBase._json_encode_hyperparameters(additional_hyperparameters) + ) return hyperparameters @classmethod diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 91f34e3010..525486d513 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -19,7 +19,7 @@ from sagemaker import image_uris, s3, utils from sagemaker.deprecations import renamed_kwargs -from sagemaker.estimator import Framework +from sagemaker.estimator import Framework, EstimatorBase import sagemaker.fw_utils as fw from sagemaker.tensorflow import defaults from sagemaker.tensorflow.model import TensorFlowModel @@ -327,7 +327,9 @@ def hyperparameters(self): ) additional_hyperparameters["model_dir"] = self.model_dir - hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters)) + hyperparameters.update( + 
EstimatorBase._json_encode_hyperparameters(additional_hyperparameters) + ) return hyperparameters def _default_s3_path(self, directory, mpi=False): diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 248eda1aa5..f735769cf3 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -17,6 +17,8 @@ import os import subprocess from time import sleep +from sagemaker.fw_utils import UploadedCode + import pytest from botocore.exceptions import ClientError @@ -3350,3 +3352,112 @@ def test_image_name_map(sagemaker_session): ) assert e.image_uri == IMAGE_URI + + +@patch("sagemaker.git_utils.git_clone_repo") +def test_git_support_with_branch_and_commit_succeed_estimator_class( + git_clone_repo, sagemaker_session +): + git_clone_repo.side_effect = lambda gitconfig, entrypoint, source_dir=None, dependencies=None: { + "entry_point": "/tmp/repo_dir/entry_point", + "source_dir": None, + "dependencies": None, + } + git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT} + entry_point = "entry_point" + fw = Estimator( + entry_point=entry_point, + git_config=git_config, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + image_uri=IMAGE_URI, + ) + fw.fit() + git_clone_repo.assert_called_once_with(git_config, entry_point, None, None) + + +@patch("sagemaker.estimator.Estimator._stage_user_code_in_s3") +def test_script_mode_estimator(patched_stage_user_code, sagemaker_session): + patched_stage_user_code.return_value = UploadedCode( + s3_prefix="s3://bucket/key", script_name="script_name" + ) + script_uri = "s3://codebucket/someprefix/sourcedir.tar.gz" + image_uri = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-gpu-py38" + model_uri = "s3://someprefix2/models/model.tar.gz" + t = Estimator( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + 
source_dir=script_uri, + image_uri=image_uri, + model_uri=model_uri, + ) + t.fit("s3://bucket/mydata") + + patched_stage_user_code.assert_called_once() + sagemaker_session.train.assert_called_once() + + +@patch("time.time", return_value=TIME) +@patch("sagemaker.estimator.tar_and_upload_dir") +def test_script_mode_estimator_same_calls_as_framework( + patched_tar_and_upload_dir, sagemaker_session +): + + patched_tar_and_upload_dir.return_value = UploadedCode( + s3_prefix="s3://%s/%s" % ("bucket", "key"), script_name="script_name" + ) + sagemaker_session.boto_region_name = REGION + + script_uri = "s3://codebucket/someprefix/sourcedir.tar.gz" + + instance_type = "ml.p2.xlarge" + instance_count = 1 + + model_uri = "s3://someprefix2/models/model.tar.gz" + training_data_uri = "s3://bucket/mydata" + + generic_estimator = Estimator( + entry_point=SCRIPT_PATH, + role=ROLE, + region=REGION, + sagemaker_session=sagemaker_session, + instance_count=instance_count, + instance_type=instance_type, + source_dir=script_uri, + image_uri=IMAGE_URI, + model_uri=model_uri, + environment={"USE_SMDEBUG": "0"}, + dependencies=[], + debugger_hook_config={}, + ) + generic_estimator.fit(training_data_uri) + + generic_estimator_tar_and_upload_dir_args = patched_tar_and_upload_dir.call_args_list + generic_estimator_train_args = sagemaker_session.train.call_args_list + + patched_tar_and_upload_dir.reset_mock() + sagemaker_session.train.reset_mock() + + framework_estimator = DummyFramework( + entry_point=SCRIPT_PATH, + role=ROLE, + region=REGION, + source_dir=script_uri, + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + model_uri=model_uri, + dependencies=[], + debugger_hook_config={}, + ) + framework_estimator.fit(training_data_uri) + + assert len(generic_estimator_tar_and_upload_dir_args) == 1 + assert len(generic_estimator_train_args) == 1 + assert generic_estimator_tar_and_upload_dir_args == patched_tar_and_upload_dir.call_args_list + 
assert generic_estimator_train_args == sagemaker_session.train.call_args_list