|
| 1 | +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# # |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"). You |
| 4 | +# may not use this file except in compliance with the License. A copy of |
| 5 | +# the License is located at |
| 6 | +# # |
| 7 | +# http://aws.amazon.com/apache2.0/ |
| 8 | +# # |
| 9 | +# or in the "license" file accompanying this file. This file is |
| 10 | +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF |
| 11 | +# ANY KIND, either express or implied. See the License for the specific |
| 12 | +# language governing permissions and limitations under the License. |
| 13 | +"""The process definitions for data wrangler.""" |
| 14 | + |
| 15 | +from __future__ import absolute_import |
| 16 | + |
| 17 | +from typing import Dict, List |
| 18 | + |
| 19 | +from sagemaker.network import NetworkConfig |
| 20 | +from sagemaker.processing import ( |
| 21 | + ProcessingInput, |
| 22 | + Processor, |
| 23 | +) |
| 24 | +from sagemaker import image_uris |
| 25 | +from sagemaker.session import Session |
| 26 | + |
| 27 | + |
class DataWranglerProcessor(Processor):
    """Handles Amazon SageMaker DataWrangler tasks."""

    def __init__(
        self,
        role: str,
        data_wrangler_flow_source: str,
        instance_count: int,
        instance_type: str,
        volume_size_in_gb: int = 30,
        volume_kms_key: str = None,
        output_kms_key: str = None,
        max_runtime_in_seconds: int = None,
        base_job_name: str = None,
        sagemaker_session: Session = None,
        env: Dict[str, str] = None,
        tags: List[dict] = None,
        network_config: NetworkConfig = None,
    ):
        """Initializes a ``DataWranglerProcessor`` instance.

        The ``DataWranglerProcessor`` is a ``Processor`` whose image is the
        "data-wrangler" image retrieved for the session's region, and which
        adds the Data Wrangler flow as a processing input named "flow".

        Args:
            role (str): An AWS IAM role name or ARN. Amazon SageMaker Processing
                uses this role to access AWS resources, such as
                data stored in Amazon S3.
            data_wrangler_flow_source (str): The source of the DataWrangler flow which will be
                used for the DataWrangler job. If a local path is provided, it will automatically
                be uploaded to S3 under:
                "s3://<default-bucket-name>/<job-name>/input/<input-name>".
            instance_count (int): The number of instances to run
                a processing job with.
            instance_type (str): The type of EC2 instance to use for
                processing, for example, 'ml.c4.xlarge'.
            volume_size_in_gb (int): Size in GB of the EBS volume
                to use for storing data during processing (default: 30).
            volume_kms_key (str): A KMS key for the processing
                volume (default: None).
            output_kms_key (str): The KMS key ID for processing job outputs (default: None).
            max_runtime_in_seconds (int): Timeout in seconds (default: None).
                After this amount of time, Amazon SageMaker terminates the job,
                regardless of its current status. If `max_runtime_in_seconds` is not
                specified, the default value is 24 hours.
            base_job_name (str): Prefix for processing job name. If not specified,
                the processor generates a default job name, based on the
                processing image name and current timestamp.
            sagemaker_session (:class:`~sagemaker.session.Session`):
                Session object which manages interactions with Amazon SageMaker and
                any other AWS services needed. If not specified, the processor creates
                one using the default AWS configuration chain.
            env (dict[str, str]): Environment variables to be passed to
                the processing jobs (default: None).
            tags (list[dict]): List of tags to be passed to the processing job
                (default: None). For more, see
                https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
            network_config (:class:`~sagemaker.network.NetworkConfig`):
                A :class:`~sagemaker.network.NetworkConfig`
                object that configures network isolation, encryption of
                inter-container traffic, security group IDs, and subnets.
        """
        self.data_wrangler_flow_source = data_wrangler_flow_source
        # The session is resolved before calling the parent constructor because
        # its region is needed to look up the Data Wrangler image URI.
        self.sagemaker_session = sagemaker_session or Session()
        image_uri = image_uris.retrieve(
            "data-wrangler", region=self.sagemaker_session.boto_region_name
        )
        super().__init__(
            role,
            image_uri,
            instance_count,
            instance_type,
            volume_size_in_gb=volume_size_in_gb,
            volume_kms_key=volume_kms_key,
            output_kms_key=output_kms_key,
            max_runtime_in_seconds=max_runtime_in_seconds,
            base_job_name=base_job_name,
            # Forward the session resolved above (not the raw, possibly None,
            # argument) so the image region and the processor share one session.
            sagemaker_session=self.sagemaker_session,
            env=env,
            tags=tags,
            network_config=network_config,
        )

    def _normalize_args(
        self,
        job_name=None,
        arguments=None,
        inputs=None,
        outputs=None,
        code=None,
        kms_key=None,
    ):
        """Normalizes the arguments so that they can be passed to the job run.

        Ensures that an input named "flow" is present, adding the Data Wrangler
        flow input when the caller did not supply one, then delegates to the
        parent class. The caller's ``inputs`` list is never modified.

        Args:
            job_name (str): Name of the processing job to be created. If not specified, one
                is generated, using the base name given to the constructor, if applicable
                (default: None).
            arguments (list[str]): A list of string arguments to be passed to a
                processing job (default: None).
            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
                the processing job. These must be provided as
                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
                the processing job. These can be specified as either path strings or
                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
            code (str): This can be an S3 URI or a local path to a file with the framework
                script to run (default: None). A no op in the base class.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).

        Returns:
            Whatever the parent class's ``_normalize_args`` returns for the
            given arguments and the (possibly extended) inputs.
        """
        # Work on a copy: appending to the caller's list would leak the "flow"
        # input into any later reuse of that list.
        inputs = list(inputs) if inputs else []
        if not any(element.input_name == "flow" for element in inputs):
            inputs.append(self._get_recipe_input())
        return super()._normalize_args(job_name, arguments, inputs, outputs, code, kms_key)

    def _get_recipe_input(self):
        """Creates a ProcessingInput for the Data Wrangler flow.

        Returns:
            :class:`~sagemaker.processing.ProcessingInput`: An input named "flow" whose
            source is ``self.data_wrangler_flow_source`` and whose destination is
            "/opt/ml/processing/flow".
        """
        return ProcessingInput(
            source=self.data_wrangler_flow_source,
            destination="/opt/ml/processing/flow",
            input_name="flow",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_data_distribution_type="FullyReplicated",
        )
0 commit comments