aws · verdimrc · Aug 7, 2021 · Aug 7, 2021 · Aug 7, 2021 · Aug 7, 2021
diff --git a/src/sagemaker/processing.py b/src/sagemaker/processing.py
@@ -18,6 +18,7 @@
 """
 from __future__ import print_function, absolute_import
 
+import json
 import os
 import pathlib
 import logging
@@ -1265,7 +1266,7 @@ class FeatureStoreOutput(ApiObject):
 class FrameworkProcessor(ScriptProcessor):
     """Handles Amazon SageMaker processing tasks for jobs using a machine learning framework."""
 
-    framework_entrypoint_command = ["/bin/bash"]
+    framework_entrypoint_command = ["python3"]
 
     # Added new (kw)args for estimator. The rest are from ScriptProcessor with same defaults.
     def __init__(
@@ -1343,7 +1344,7 @@ def __init__(
                 inter-container traffic, security group IDs, and subnets (default: None).
         """
         if not command:
-            command = ["python"]
+            command = ["python3"]
 
         self.estimator_cls = estimator_cls
         self.framework_version = framework_version
@@ -1466,12 +1467,12 @@ def get_run_args(
         """
         # When job_name is None, the job_name to upload code (+payload) will
         # differ from job_name used by run().
-        s3_runproc_sh, inputs, job_name = self._pack_and_upload_code(
+        s3_runproc_py, inputs, job_name = self._pack_and_upload_code(
             code, source_dir, dependencies, git_config, job_name, inputs
         )
 
         return RunArgs(
-            s3_runproc_sh,
+            s3_runproc_py,
             inputs=inputs,
             outputs=outputs,
             arguments=arguments,
@@ -1589,13 +1590,13 @@ def run(  # type: ignore[override]
             kms_key (str): The ARN of the KMS key that is used to encrypt the
                 user code file (default: None).
         """
-        s3_runproc_sh, inputs, job_name = self._pack_and_upload_code(
+        s3_runproc_py, inputs, job_name = self._pack_and_upload_code(
             code, source_dir, dependencies, git_config, job_name, inputs
         )
 
         # Submit a processing job.
         super().run(
-            code=s3_runproc_sh,
+            code=s3_runproc_py,
             inputs=inputs,
             outputs=outputs,
             arguments=arguments,
@@ -1635,20 +1636,20 @@ def _pack_and_upload_code(self, code, source_dir, dependencies, git_config, job_
                 "automatically."
             )
 
-        # Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
+        # Upload the bootstrapping code as s3://.../jobname/source/runproc.py.
         entrypoint_s3_uri = estimator.uploaded_code.s3_prefix.replace(
             "sourcedir.tar.gz",
-            "runproc.sh",
+            "runproc.py",
         )
         script = estimator.uploaded_code.script_name
-        s3_runproc_sh = S3Uploader.upload_string_as_file_body(
+        s3_runproc_py = S3Uploader.upload_string_as_file_body(
             self._generate_framework_script(script),
             desired_s3_uri=entrypoint_s3_uri,
             sagemaker_session=self.sagemaker_session,
         )
-        logger.info("runproc.sh uploaded to %s", s3_runproc_sh)
+        logger.info("runproc.py uploaded to %s", s3_runproc_py)
 
-        return s3_runproc_sh, inputs, job_name
+        return s3_runproc_py, inputs, job_name
 
     def _generate_framework_script(self, user_script: str) -> str:
         """Generate the framework entrypoint file (as text) for a processing job.
@@ -1664,27 +1665,29 @@ def _generate_framework_script(self, user_script: str) -> str:
         """
         return dedent(
             """\
-            #!/bin/bash
+            import os
+            import subprocess
+            import sys
+            import tarfile
 
-            cd /opt/ml/processing/input/code/
-            tar -xzf sourcedir.tar.gz
 
-            # Exit on any error. SageMaker uses error code to mark failed job.
-            set -e
+            if __name__ == "__main__":
+                os.chdir("/opt/ml/processing/input/code")
 
-            if [[ -f 'requirements.txt' ]]; then
-                # Some py3 containers has typing, which may breaks pip install
-                pip uninstall --yes typing
+                with tarfile.open("sourcedir.tar.gz", "r:gz") as tar:
+                    tar.extractall()
 
-                pip install -r requirements.txt
-            fi
+                if os.path.isfile("requirements.txt"):
+                    # Some py3 containers have typing, which may break pip install
+                    subprocess.run(["pip", "uninstall", "--yes", "typing"])
 
-            {entry_point_command} {entry_point} "$@"
+                    subprocess.run(["pip", "install", "-r", "requirements.txt"], check=True)
+
+                cmd = {entry_point_command} + sys.argv[1:]
+                print(' '.join(cmd))
+                sys.exit(subprocess.run(cmd).returncode)  # propagate failure to SageMaker
         """
-        ).format(
-            entry_point_command=" ".join(self.command),
-            entry_point=user_script,
-        )
+        ).format(entry_point_command=json.dumps(self.command + [user_script]))
 
     def _upload_payload(
         self,
@@ -1721,7 +1724,7 @@ def _patch_inputs_with_payload(self, inputs, s3_payload) -> List[ProcessingInput
         # Follow the exact same mechanism that ScriptProcessor does, which
         # is to inject the S3 code artifact as a processing input. Note that
         # framework processor take-over /opt/ml/processing/input/code for
-        # sourcedir.tar.gz, and let ScriptProcessor to place runproc.sh under
+        # sourcedir.tar.gz, and lets ScriptProcessor place runproc.py under
         # /opt/ml/processing/input/{self._CODE_CONTAINER_INPUT_NAME}.
         #
         # See:

diff --git a/src/sagemaker/sklearn/processing.py b/src/sagemaker/sklearn/processing.py
@@ -17,93 +17,67 @@
 """
 from __future__ import absolute_import
 
-from sagemaker import image_uris, Session
-from sagemaker.processing import ScriptProcessor
-from sagemaker.sklearn import defaults
+from sagemaker.processing import FrameworkProcessor
+from sagemaker.sklearn.estimator import SKLearn
 
 
-class SKLearnProcessor(ScriptProcessor):
-    """Handles Amazon SageMaker processing tasks for jobs using scikit-learn."""
+class SKLearnProcessor(FrameworkProcessor):
+    """Initialize an ``SKLearnProcessor`` instance.
+
+    The SKLearnProcessor handles Amazon SageMaker processing tasks for jobs using scikit-learn.
+
+    Unless ``image_uri`` is specified, the scikit-learn environment is an
+    Amazon-built Docker container that executes functions defined in the supplied
+    ``code`` Python script.
+
+    The arguments have the exact same meaning as in ``FrameworkProcessor``.
+
+    .. tip::
+
+        You can find additional parameters for initializing this class at
+        :class:`~sagemaker.processing.FrameworkProcessor`.
+    """
+
+    estimator_cls = SKLearn
 
     def __init__(
         self,
-        framework_version,
+        framework_version,  # New arg
         role,
-        instance_type,
         instance_count,
+        instance_type,
+        py_version="py3",  # New kwarg
+        image_uri=None,
         command=None,
         volume_size_in_gb=30,
         volume_kms_key=None,
         output_kms_key=None,
+        code_location=None,  # New arg
         max_runtime_in_seconds=None,
         base_job_name=None,
         sagemaker_session=None,
         env=None,
         tags=None,
         network_config=None,
     ):
-        """Initialize an ``SKLearnProcessor`` instance.
-
-        The SKLearnProcessor handles Amazon SageMaker processing tasks for jobs using scikit-learn.
-
-        Args:
-            framework_version (str): The version of scikit-learn.
-            role (str): An AWS IAM role name or ARN. The Amazon SageMaker training jobs
-                and APIs that create Amazon SageMaker endpoints use this role
-                to access training data and model artifacts. After the endpoint
-                is created, the inference code might use the IAM role, if it
-                needs to access an AWS resource.
-            instance_type (str): Type of EC2 instance to use for
-                processing, for example, 'ml.c4.xlarge'.
-            instance_count (int): The number of instances to run
-                the Processing job with. Defaults to 1.
-            command ([str]): The command to run, along with any command-line flags.
-                Example: ["python3", "-v"]. If not provided, ["python3"] or ["python2"]
-                will be chosen based on the py_version parameter.
-            volume_size_in_gb (int): Size in GB of the EBS volume to
-                use for storing data during processing (default: 30).
-            volume_kms_key (str): A KMS key for the processing
-                volume.
-            output_kms_key (str): The KMS key id for all ProcessingOutputs.
-            max_runtime_in_seconds (int): Timeout in seconds.
-                After this amount of time Amazon SageMaker terminates the job
-                regardless of its current status.
-            base_job_name (str): Prefix for processing name. If not specified,
-                the processor generates a default job name, based on the
-                training image name and current timestamp.
-            sagemaker_session (sagemaker.session.Session): Session object which
-                manages interactions with Amazon SageMaker APIs and any other
-                AWS services needed. If not specified, the processor creates one
-                using the default AWS configuration chain.
-            env (dict): Environment variables to be passed to the processing job.
-            tags ([dict]): List of tags to be passed to the processing job.
-            network_config (sagemaker.network.NetworkConfig): A NetworkConfig
-                object that configures network isolation, encryption of
-                inter-container traffic, security group IDs, and subnets.
-        """
-        if not command:
-            command = ["python3"]
-
-        session = sagemaker_session or Session()
-        region = session.boto_region_name
-
-        image_uri = image_uris.retrieve(
-            defaults.SKLEARN_NAME, region, version=framework_version, instance_type=instance_type
-        )
-
-        super(SKLearnProcessor, self).__init__(
-            role=role,
-            image_uri=image_uri,
-            instance_count=instance_count,
-            instance_type=instance_type,
-            command=command,
-            volume_size_in_gb=volume_size_in_gb,
-            volume_kms_key=volume_kms_key,
-            output_kms_key=output_kms_key,
-            max_runtime_in_seconds=max_runtime_in_seconds,
-            base_job_name=base_job_name,
-            sagemaker_session=session,
-            env=env,
-            tags=tags,
-            network_config=network_config,
+        """This processor executes a Python script in a scikit-learn execution environment."""
+        super().__init__(
+            self.estimator_cls,
+            framework_version,
+            role,
+            instance_count,
+            instance_type,
+            py_version,
+            image_uri,
+            command,
+            volume_size_in_gb,
+            volume_kms_key,
+            output_kms_key,
+            code_location,
+            max_runtime_in_seconds,
+            base_job_name,
+            sagemaker_session,
+            env,
+            tags,
+            network_config,
         )
diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py
@@ -349,12 +349,12 @@ def test_local_processing_sklearn(sagemaker_local_session_no_local_code, sklearn
 
     job_description = sklearn_processor.latest_job.describe()
 
-    assert len(job_description["ProcessingInputs"]) == 2
+    assert len(job_description["ProcessingInputs"]) == 3
     assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
     assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == "local"
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
         "python3",
-        "/opt/ml/processing/input/code/dummy_script.py",
+        "/opt/ml/processing/input/entrypoint/runproc.py",
     ]
     assert job_description["RoleArn"] == "<no_role>"
 

diff --git a/tests/integ/test_processing.py b/tests/integ/test_processing.py
@@ -139,7 +139,7 @@ def test_sklearn(sagemaker_session, sklearn_latest_version, cpu_instance_type):
 
     job_description = sklearn_processor.latest_job.describe()
 
-    assert len(job_description["ProcessingInputs"]) == 2
+    assert len(job_description["ProcessingInputs"]) == 3
     assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
     assert (
         job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
@@ -148,7 +148,7 @@ def test_sklearn(sagemaker_session, sklearn_latest_version, cpu_instance_type):
     assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 86400}
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
         "python3",
-        "/opt/ml/processing/input/code/dummy_script.py",
+        "/opt/ml/processing/input/entrypoint/runproc.py",
     ]
     assert ROLE in job_description["RoleArn"]
 
@@ -205,6 +205,7 @@ def test_sklearn_with_customizations(
     assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
 
     assert job_description["ProcessingInputs"][1]["InputName"] == "code"
+    assert job_description["ProcessingInputs"][2]["InputName"] == "entrypoint"
 
     assert job_description["ProcessingJobName"].startswith("test-sklearn-with-customizations")
 
@@ -222,7 +223,7 @@ def test_sklearn_with_customizations(
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
         "python3",
-        "/opt/ml/processing/input/code/dummy_script.py",
+        "/opt/ml/processing/input/entrypoint/runproc.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri
 
@@ -288,6 +289,13 @@ def test_sklearn_with_custom_default_bucket(
 
     assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
     assert custom_bucket_name in job_description["ProcessingInputs"][0]["S3Input"]["S3Uri"]
+
+    assert job_description["ProcessingInputs"][1]["InputName"] == "code"
+    assert custom_bucket_name in job_description["ProcessingInputs"][1]["S3Input"]["S3Uri"]
+
+    assert job_description["ProcessingInputs"][2]["InputName"] == "entrypoint"
+    assert custom_bucket_name in job_description["ProcessingInputs"][2]["S3Input"]["S3Uri"]
+
     assert job_description["ProcessingJobName"].startswith("test-sklearn-with-customizations")
 
     assert job_description["ProcessingJobStatus"] == "Completed"
@@ -304,7 +312,7 @@ def test_sklearn_with_custom_default_bucket(
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
         "python3",
-        "/opt/ml/processing/input/code/dummy_script.py",
+        "/opt/ml/processing/input/entrypoint/runproc.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri
 
@@ -340,6 +348,7 @@ def test_sklearn_with_no_inputs_or_outputs(
     job_description = sklearn_processor.latest_job.describe()
 
     assert job_description["ProcessingInputs"][0]["InputName"] == "code"
+    assert job_description["ProcessingInputs"][1]["InputName"] == "entrypoint"
 
     assert job_description["ProcessingJobName"].startswith("test-sklearn-with-no-inputs")
 
@@ -354,7 +363,7 @@ def test_sklearn_with_no_inputs_or_outputs(
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
         "python3",
-        "/opt/ml/processing/input/code/dummy_script.py",
+        "/opt/ml/processing/input/entrypoint/runproc.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri
 

diff --git a/tests/integ/test_sklearn.py b/tests/integ/test_sklearn.py
@@ -46,6 +46,20 @@ def sklearn_training_job(
     sagemaker_session.boto_region_name
 
 
+def test_framework_processing_job_with_deps(
+    sagemaker_session,
+    sklearn_latest_version,
+    sklearn_latest_py_version,
+    cpu_instance_type,
+):
+    _run_processing_job(  # no `return`: pytest expects test functions to return None
+        sagemaker_session,
+        cpu_instance_type,
+        sklearn_latest_version,
+        sklearn_latest_py_version,
+    )
+
+
 def test_training_with_additional_hyperparameters(
     sagemaker_session,
     sklearn_latest_version,