Merge pull request aws#11 from verdimrc/fp-get-run-args

verdimrc · web-flow · commit dd48ee67f8a4 · 2021-05-20T14:52:14.000+08:00
FrameworkProcessor.get_run_args()
diff --git a/src/sagemaker/processing.py b/src/sagemaker/processing.py
@@ -1230,7 +1230,7 @@ def __init__(
         instance_type,
         py_version="py3",  # New kwarg
         image_uri=None,
-        command=["python"],
+        command=["python3"],
         volume_size_in_gb=30,
         volume_kms_key=None,
         output_kms_key=None,
@@ -1359,6 +1359,60 @@ def _pre_init_normalization(
 
         return image_uri, base_job_name
 
+    def get_run_args(
+        self,
+        code,
+        source_dir=None,
+        dependencies=None,
+        git_config=None,
+        inputs=None,
+        outputs=None,
+        arguments=None,
+        job_name=None,
+    ):
+        """Returns a RunArgs object.
+
+        This object contains the normalized inputs, outputs and arguments needed
+        when using a ``FrameworkProcessor`` in a :class:`~sagemaker.workflow.steps.ProcessingStep`.
+
+        Args:
+            code (str): This can be an S3 URI or a local path to a file with the framework
+                script to run. See the ``code`` argument in
+                `sagemaker.processing.FrameworkProcessor.run()`.
+            source_dir (str): Path (absolute, relative, or an S3 URI) to a directory with
+                any other processing source code dependencies aside from the entrypoint
+                file (default: None). See the ``source_dir`` argument in
+                `sagemaker.processing.FrameworkProcessor.run()`.
+            dependencies (list[str]): A list of paths to directories (absolute or relative)
+                with any additional libraries that will be exported to the container
+                (default: []). See the ``dependencies`` argument in
+                `sagemaker.processing.FrameworkProcessor.run()`.
+            git_config (dict[str, str]): Git configurations used for cloning files. See the
+                `git_config` argument in `sagemaker.processing.FrameworkProcessor.run()`.
+            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
+                the processing job. These must be provided as
+                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
+            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
+                the processing job. These can be specified as either path strings or
+                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
+            arguments (list[str]): A list of string arguments to be passed to a
+                processing job (default: None).
+            job_name (str): Processing job name. If not specified, the processor generates
+                a default job name, based on the base job name and current timestamp.
+        """
+        # When job_name is None, the job name generated here to upload the code
+        # (+payload) will differ from the job name later generated by run().
+        s3_runproc_sh, inputs, job_name = self._pack_and_upload_code(
+            code, source_dir, dependencies, git_config, job_name, inputs
+        )
+
+        return RunArgs(
+            s3_runproc_sh,
+            inputs=inputs,
+            outputs=outputs,
+            arguments=arguments,
+        )
+
     def run(  # type: ignore[override]
         self,
         code,
@@ -1377,15 +1431,17 @@ def run(  # type: ignore[override]
         """Runs a processing job.
 
         Args:
-            code (str): Path (absolute or relative) to the local Python source
-                file which should be executed as the entry point to training. If
-                ``source_dir`` is specified, then ``code`` must point to a file
-                located at the root of ``source_dir``.
+            code (str): This can be an S3 URI or a local path to a file with the
+                framework script to run. Path (absolute or relative) to the local
+                Python source file which should be executed as the entry point
+                to processing. When ``code`` is an S3 URI, ``source_dir``, ``dependencies``,
+                and ``git_config`` are ignored. If ``source_dir`` is specified, then
+                ``code`` must point to a file located at the root of ``source_dir``.
             source_dir (str): Path (absolute, relative or an S3 URI) to a directory
-                with any other training source code dependencies aside from the entry
+                with any other processing source code dependencies aside from the entry
                 point file (default: None). If ``source_dir`` is an S3 URI, it must
                 point to a tar.gz file. Structure within this directory are preserved
-                when training on Amazon SageMaker (default: None).
+                when processing on Amazon SageMaker (default: None).
             dependencies (list[str]): A list of paths to directories (absolute
                 or relative) with any additional libraries that will be exported
                 to the container (default: []). The library folders will be
@@ -1461,12 +1517,40 @@ def run(  # type: ignore[override]
             kms_key (str): The ARN of the KMS key that is used to encrypt the
                 user code file (default: None).
         """
-        if job_name is None:
-            job_name = self._generate_current_job_name()
+        s3_runproc_sh, inputs, job_name = self._pack_and_upload_code(
+            code, source_dir, dependencies, git_config, job_name, inputs
+        )
 
-        estimator = self._upload_payload(code, source_dir, dependencies, git_config, job_name)
+        # Submit a processing job.
+        super().run(
+            code=s3_runproc_sh,
+            inputs=inputs,
+            outputs=outputs,
+            arguments=arguments,
+            wait=wait,
+            logs=logs,
+            job_name=job_name,
+            experiment_config=experiment_config,
+            kms_key=kms_key,
+        )
+
+    def _pack_and_upload_code(self, code, source_dir, dependencies, git_config, job_name, inputs):
+        if code.startswith("s3://"):
+            return code, inputs, job_name
+
+        if job_name is None:
+            job_name = self._generate_current_job_name(job_name)
+
+        estimator = self._upload_payload(
+            code,
+            source_dir,
+            dependencies,
+            git_config,
+            job_name,
+        )
         inputs = self._patch_inputs_with_payload(
-            inputs, estimator._hyperparameters["sagemaker_submit_directory"]
+            inputs,
+            estimator._hyperparameters["sagemaker_submit_directory"],
         )
 
         local_code = get_config_value("local.local_code", self.sagemaker_session.config)
@@ -1490,18 +1574,7 @@ def run(  # type: ignore[override]
         )
         logger.info("runproc.sh uploaded to %s", s3_runproc_sh)
 
-        # Submit a processing job.
-        super().run(
-            code=s3_runproc_sh,
-            inputs=inputs,
-            outputs=outputs,
-            arguments=arguments,
-            wait=wait,
-            logs=logs,
-            job_name=job_name,
-            experiment_config=experiment_config,
-            kms_key=kms_key,
-        )
+        return s3_runproc_sh, inputs, job_name
 
     def _generate_framework_script(self, user_script: str) -> str:
         """Generate the framework entrypoint file (as text) for a processing job.
@@ -1525,7 +1598,12 @@ def _generate_framework_script(self, user_script: str) -> str:
             # Exit on any error. SageMaker uses error code to mark failed job.
             set -e
 
-            [[ -f 'requirements.txt' ]] && pip install -r requirements.txt
+            if [[ -f 'requirements.txt' ]]; then
+                # Some py3 containers has typing, which may breaks pip install
+                pip uninstall --yes typing
+
+                pip install -r requirements.txt
+            fi
 
             {entry_point_command} {entry_point} "$@"
         """
diff --git a/src/sagemaker/sklearn/processing.py b/src/sagemaker/sklearn/processing.py
@@ -48,7 +48,7 @@ def __init__(
         instance_type,
         py_version="py3",  # New kwarg
         image_uri=None,
-        command=["python"],
+        command=["python3"],
         volume_size_in_gb=30,
         volume_kms_key=None,
         output_kms_key=None,
diff --git a/tests/unit/test_processing.py b/tests/unit/test_processing.py
@@ -162,7 +162,7 @@ def test_sklearn_with_all_parameters(
 @patch("os.path.exists", return_value=True)
 @patch("os.path.isfile", return_value=True)
 def test_sklearn_with_all_parameters_via_run_args(
-    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
+    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session, uploaded_code
 ):
     botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}
     custom_command = ["Rscript"]
@@ -190,28 +190,31 @@ def test_sklearn_with_all_parameters_via_run_args(
         sagemaker_session=sagemaker_session,
     )
 
-    # FIXME: to check FrameworkProcessor.get_run_args(), and possibly fix with
-    # source_dir, dependencies.
-    run_args = processor.get_run_args(
-        code="/local/path/to/processing_code.py",
-        inputs=_get_data_inputs_all_parameters(),
-        outputs=_get_data_outputs_all_parameters(),
-        arguments=["--drop-columns", "'SelfEmployed'"],
-    )
+    with patch("sagemaker.estimator.tar_and_upload_dir", return_value=uploaded_code):
+        run_args = processor.get_run_args(
+            code="processing_code.py",
+            source_dir="/local/path/to/source_dir",
+            dependencies=["/local/path/to/dep_01"],
+            git_config=None,
+            inputs=_get_data_inputs_all_parameters(),
+            outputs=_get_data_outputs_all_parameters(),
+            arguments=["--drop-columns", "'SelfEmployed'"],
+        )
 
-    processor.run(
-        code=run_args.code,
-        inputs=run_args.inputs,
-        outputs=run_args.outputs,
-        arguments=run_args.arguments,
-        wait=True,
-        logs=False,
-        experiment_config={"ExperimentName": "AnExperiment"},
-    )
+        processor.run(
+            code=run_args.code,
+            inputs=run_args.inputs,
+            outputs=run_args.outputs,
+            arguments=run_args.arguments,
+            wait=True,
+            logs=False,
+            experiment_config={"ExperimentName": "AnExperiment"},
+        )
 
     expected_args = _get_expected_args_all_parameters_modular_code(
         processor._current_job_name,
         instance_count=2,
+        code_s3_prefix=run_args.code.replace("/runproc.sh", ""),
     )
     sklearn_image_uri = (
         "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
@@ -235,7 +238,7 @@ def test_sklearn_with_all_parameters_via_run_args(
 @patch("os.path.exists", return_value=True)
 @patch("os.path.isfile", return_value=True)
 def test_sklearn_with_all_parameters_via_run_args_called_twice(
-    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
+    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session, uploaded_code
 ):
     botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}
 
@@ -261,15 +264,22 @@ def test_sklearn_with_all_parameters_via_run_args_called_twice(
         sagemaker_session=sagemaker_session,
     )
 
-    run_args = processor.get_run_args(
-        code="/local/path/to/processing_code.py",
-        inputs=_get_data_inputs_all_parameters(),
-        outputs=_get_data_outputs_all_parameters(),
-        arguments=["--drop-columns", "'SelfEmployed'"],
-    )
+    with patch("sagemaker.estimator.tar_and_upload_dir", return_value=uploaded_code):
+        run_args = processor.get_run_args(
+            code="processing_code.py",
+            source_dir="/local/path/to/source_dir",
+            dependencies=["/local/path/to/dep_01"],
+            git_config=None,
+            inputs=_get_data_inputs_all_parameters(),
+            outputs=_get_data_outputs_all_parameters(),
+            arguments=["--drop-columns", "'SelfEmployed'"],
+        )
 
     run_args = processor.get_run_args(
         code="/local/path/to/processing_code.py",
+        source_dir=None,
+        dependencies=None,
+        git_config=None,
         inputs=_get_data_inputs_all_parameters(),
         outputs=_get_data_outputs_all_parameters(),
         arguments=["--drop-columns", "'SelfEmployed'"],
@@ -285,7 +295,10 @@ def test_sklearn_with_all_parameters_via_run_args_called_twice(
         experiment_config={"ExperimentName": "AnExperiment"},
     )
 
-    expected_args = _get_expected_args_all_parameters_modular_code(processor._current_job_name)
+    expected_args = _get_expected_args_all_parameters_modular_code(
+        processor._current_job_name,
+        code_s3_prefix=run_args.code.replace("/runproc.sh", ""),
+    )
     sklearn_image_uri = (
         "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
     ).format(sklearn_version)
@@ -839,9 +852,14 @@ def _get_data_outputs_all_parameters():
 
 
 def _get_expected_args_all_parameters_modular_code(
-    job_name, code_s3_uri=MOCKED_S3_URI, instance_count=1
+    job_name,
+    code_s3_uri=MOCKED_S3_URI,
+    instance_count=1,
+    code_s3_prefix=None,
 ):
-    # Add something to inputs
+    if code_s3_prefix is None:
+        code_s3_prefix = f"{code_s3_uri}/{job_name}/source"
+
     return {
         "inputs": [
             {
@@ -911,7 +929,7 @@ def _get_expected_args_all_parameters_modular_code(
                 "InputName": "code",
                 "AppManaged": False,
                 "S3Input": {
-                    "S3Uri": f"{code_s3_uri}/{job_name}/source/sourcedir.tar.gz",
+                    "S3Uri": f"{code_s3_prefix}/sourcedir.tar.gz",
                     "LocalPath": "/opt/ml/processing/input/code/",
                     "S3DataType": "S3Prefix",
                     "S3InputMode": "File",
@@ -923,7 +941,7 @@ def _get_expected_args_all_parameters_modular_code(
                 "InputName": "entrypoint",
                 "AppManaged": False,
                 "S3Input": {
-                    "S3Uri": f"{code_s3_uri}/{job_name}/source/runproc.sh",
+                    "S3Uri": f"{code_s3_prefix}/runproc.sh",
                     "LocalPath": "/opt/ml/processing/input/entrypoint",
                     "S3DataType": "S3Prefix",
                     "S3InputMode": "File",