aws · ahsan-z-khan · Aug 6, 2021 · Aug 6, 2021 · Aug 6, 2021 · Aug 6, 2021
@@ -17,67 +17,92 @@
 """
 from __future__ import absolute_import
 
-from sagemaker.processing import FrameworkProcessor
-from sagemaker.sklearn.estimator import SKLearn
+from sagemaker import image_uris, Session
+from sagemaker.processing import ScriptProcessor
+from sagemaker.sklearn import defaults
 
 
-class SKLearnProcessor(FrameworkProcessor):
-    """Initialize an ``SKLearnProcessor`` instance.
-
-    The SKLearnProcessor handles Amazon SageMaker processing tasks for jobs using scikit-learn.
-
-    Unless ``image_uri`` is specified, the scikit-learn environment is an
-    Amazon-built Docker container that executes functions defined in the supplied
-    ``code`` Python script.
-
-    The arguments have the exact same meaning as in ``FrameworkProcessor``.
-
-    .. tip::
-
-        You can find additional parameters for initializing this class at
-        :class:`~sagemaker.processing.FrameworkProcessor`.
-    """
-
-    estimator_cls = SKLearn
+class SKLearnProcessor(ScriptProcessor):
+    """Handles Amazon SageMaker processing tasks for jobs using scikit-learn."""
 
     def __init__(
         self,
-        framework_version,  # New arg
+        framework_version,
         role,
-        instance_count,
         instance_type,
-        py_version="py3",  # New kwarg
-        image_uri=None,
+        instance_count,
         command=None,
         volume_size_in_gb=30,
         volume_kms_key=None,
         output_kms_key=None,
-        code_location=None,  # New arg
         max_runtime_in_seconds=None,
         base_job_name=None,
         sagemaker_session=None,
         env=None,
         tags=None,
         network_config=None,
     ):
-        """This processor executes a Python script in a scikit-learn execution environment."""
-        super().__init__(
-            self.estimator_cls,
-            framework_version,
-            role,
-            instance_count,
-            instance_type,
-            py_version,
-            image_uri,
-            command,
-            volume_size_in_gb,
-            volume_kms_key,
-            output_kms_key,
-            code_location,
-            max_runtime_in_seconds,
-            base_job_name,
-            sagemaker_session,
-            env,
-            tags,
-            network_config,
+        """Initialize an ``SKLearnProcessor`` instance.
+
+        The SKLearnProcessor handles Amazon SageMaker processing tasks for jobs using scikit-learn.
+        Args:
+            framework_version (str): The version of scikit-learn.
+            role (str): An AWS IAM role name or ARN. The Amazon SageMaker training jobs
+                and APIs that create Amazon SageMaker endpoints use this role
+                to access training data and model artifacts. After the endpoint
+                is created, the inference code might use the IAM role, if it
+                needs to access an AWS resource.
+            instance_type (str): Type of EC2 instance to use for
+                processing, for example, 'ml.c4.xlarge'.
+            instance_count (int): The number of instances to run
+                the Processing job with.
+            command ([str]): The command to run, along with any command-line flags.
+                Example: ["python3", "-v"]. If not provided, the command
+                defaults to ["python3"].
+            volume_size_in_gb (int): Size in GB of the EBS volume to
+                use for storing data during processing (default: 30).
+            volume_kms_key (str): A KMS key for the processing
+                volume.
+            output_kms_key (str): The KMS key id for all ProcessingOutputs.
+            max_runtime_in_seconds (int): Timeout in seconds.
+                After this amount of time Amazon SageMaker terminates the job
+                regardless of its current status.
+            base_job_name (str): Prefix for processing name. If not specified,
+                the processor generates a default job name, based on the
+                training image name and current timestamp.
+            sagemaker_session (sagemaker.session.Session): Session object which
+                manages interactions with Amazon SageMaker APIs and any other
+                AWS services needed. If not specified, the processor creates one
+                using the default AWS configuration chain.
+            env (dict): Environment variables to be passed to the processing job.
+            tags ([dict]): List of tags to be passed to the processing job.
+            network_config (sagemaker.network.NetworkConfig): A NetworkConfig
+                object that configures network isolation, encryption of
+                inter-container traffic, security group IDs, and subnets.
+        """
+        if not command:
+            command = ["python3"]
+
+        session = sagemaker_session or Session()
+        region = session.boto_region_name
+
+        image_uri = image_uris.retrieve(
+            defaults.SKLEARN_NAME, region, version=framework_version, instance_type=instance_type
+        )
+
+        super(SKLearnProcessor, self).__init__(
+            role=role,
+            image_uri=image_uri,
+            instance_count=instance_count,
+            instance_type=instance_type,
+            command=command,
+            volume_size_in_gb=volume_size_in_gb,
+            volume_kms_key=volume_kms_key,
+            output_kms_key=output_kms_key,
+            max_runtime_in_seconds=max_runtime_in_seconds,
+            base_job_name=base_job_name,
+            sagemaker_session=session,
+            env=env,
+            tags=tags,
+            network_config=network_config,
         )
@@ -125,6 +125,7 @@ def test_sklearn(sagemaker_session, sklearn_latest_version, cpu_instance_type):
         role=ROLE,
         instance_type=cpu_instance_type,
         instance_count=1,
+        command=["python3"],
         sagemaker_session=sagemaker_session,
         base_job_name="test-sklearn",
     )
@@ -138,16 +139,16 @@ def test_sklearn(sagemaker_session, sklearn_latest_version, cpu_instance_type):
 
     job_description = sklearn_processor.latest_job.describe()
 
-    assert len(job_description["ProcessingInputs"]) == 3
+    assert len(job_description["ProcessingInputs"]) == 2
     assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
     assert (
         job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
     )
     assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 30
     assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 86400}
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
-        "/bin/bash",
-        "/opt/ml/processing/input/entrypoint/runproc.sh",
+        "python3",
+        "/opt/ml/processing/input/code/dummy_script.py",
     ]
     assert ROLE in job_description["RoleArn"]
 
@@ -203,7 +204,6 @@ def test_sklearn_with_customizations(
     assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
 
     assert job_description["ProcessingInputs"][1]["InputName"] == "code"
-    assert job_description["ProcessingInputs"][2]["InputName"] == "entrypoint"
 
     assert job_description["ProcessingJobName"].startswith("test-sklearn-with-customizations")
 
@@ -220,8 +220,8 @@ def test_sklearn_with_customizations(
 
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
-        "/bin/bash",
-        "/opt/ml/processing/input/entrypoint/runproc.sh",
+        "python3",
+        "/opt/ml/processing/input/code/dummy_script.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri
 
@@ -245,6 +245,7 @@ def test_sklearn_with_custom_default_bucket(
     sklearn_processor = SKLearnProcessor(
         framework_version=sklearn_latest_version,
         role=ROLE,
+        command=["python3"],
         instance_type=cpu_instance_type,
         instance_count=1,
         volume_size_in_gb=100,
@@ -287,9 +288,6 @@ def test_sklearn_with_custom_default_bucket(
     assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
     assert custom_bucket_name in job_description["ProcessingInputs"][0]["S3Input"]["S3Uri"]
 
     assert job_description["ProcessingInputs"][1]["InputName"] == "code"
     assert custom_bucket_name in job_description["ProcessingInputs"][1]["S3Input"]["S3Uri"]
 
-    assert job_description["ProcessingInputs"][2]["InputName"] == "entrypoint"
-    assert custom_bucket_name in job_description["ProcessingInputs"][2]["S3Input"]["S3Uri"]
-
@@ -308,8 +306,8 @@ def test_sklearn_with_custom_default_bucket(
 
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
-        "/bin/bash",
-        "/opt/ml/processing/input/entrypoint/runproc.sh",
+        "python3",
+        "/opt/ml/processing/input/code/dummy_script.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri
 
@@ -326,6 +324,7 @@ def test_sklearn_with_no_inputs_or_outputs(
     sklearn_processor = SKLearnProcessor(
         framework_version=sklearn_latest_version,
         role=ROLE,
+        command=["python3"],
         instance_type=cpu_instance_type,
         instance_count=1,
         volume_size_in_gb=100,
@@ -338,16 +337,12 @@ def test_sklearn_with_no_inputs_or_outputs(
     )
 
     sklearn_processor.run(
-        code=os.path.join(DATA_DIR, "dummy_script.py"),
-        arguments=["-v"],
-        wait=True,
-        logs=True,
+        code=os.path.join(DATA_DIR, "dummy_script.py"), arguments=["-v"], wait=True, logs=True
     )
 
     job_description = sklearn_processor.latest_job.describe()
 
     assert job_description["ProcessingInputs"][0]["InputName"] == "code"
-    assert job_description["ProcessingInputs"][1]["InputName"] == "entrypoint"
 
     assert job_description["ProcessingJobName"].startswith("test-sklearn-with-no-inputs")
 
@@ -361,8 +356,8 @@ def test_sklearn_with_no_inputs_or_outputs(
 
     assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
     assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
-        "/bin/bash",
-        "/opt/ml/processing/input/entrypoint/runproc.sh",
+        "python3",
+        "/opt/ml/processing/input/code/dummy_script.py",
     ]
     assert job_description["AppSpecification"]["ImageUri"] == image_uri