feat(sagemaker): add min-maxCapacity for autoscaling of custom-sagemaker-endpoint (#685)

erdemayyildiz · Erdem Ayyildiz · web-flow · commit e1208764664b · 2024-09-13T10:13:44.000-07:00
* feat: add min-maxCapacity for autoscaling of custom-sagemaker-endpoint

---------

Co-authored-by: Erdem Ayyildiz <erdemayy@amazon.com>
diff --git a/apidocs/classes/CustomSageMakerEndpoint.md b/apidocs/classes/CustomSageMakerEndpoint.md
@@ -200,6 +200,12 @@ Default  log retention config for all constructs
 
 ***
 
+### scalingPolicy
+
+> `readonly` **scalingPolicy**: `StepScalingPolicy`
+
+***
+
 ### stage
 
 > **stage**: `string`
diff --git a/apidocs/interfaces/CustomSageMakerEndpointProps.md b/apidocs/interfaces/CustomSageMakerEndpointProps.md
@@ -48,6 +48,18 @@
 
 ***
 
+### maxCapacity?
+
+> `readonly` `optional` **maxCapacity**: `number`
+
+***
+
+### minCapacity?
+
+> `readonly` `optional` **minCapacity**: `number`
+
+***
+
 ### modelDataDownloadTimeoutInSeconds?
 
 > `readonly` `optional` **modelDataDownloadTimeoutInSeconds**: `number`
diff --git a/src/patterns/gen-ai/aws-model-deployment-sagemaker/README_custom_sagemaker_endpoint.md b/src/patterns/gen-ai/aws-model-deployment-sagemaker/README_custom_sagemaker_endpoint.md
@@ -62,7 +62,9 @@ new CustomSageMakerEndpoint(this, 'customModel', {
     modelDataUrl: 's3://{Bucket}/{Key}/model.tar.gz',
     endpointName: 'testbgebase',
     instanceCount: 1,
-    volumeSizeInGb: 100
+    volumeSizeInGb: 100,
+    minCapacity: 1,
+    maxCapacity: 2,
   });
 ```
 
@@ -92,6 +94,8 @@ CustomSageMakerEndpoint(
     endpoint_name='testbgebase',
     instance_count=1,
     volume_size_in_gb=100,
+    min_capacity=1,
+    max_capacity=2,
 )
 ```
 
@@ -132,6 +136,8 @@ Parameters
 | modelDataDownloadTimeoutInSeconds | Integer | ![Optional](https://img.shields.io/badge/optional-4169E1) | The timeout value, in seconds, to download and extract the model that you want to host from Amazon S3 to the individual inference instance associated with this production variant. |
 | volumeSizeInGb | Integer | ![Optional](https://img.shields.io/badge/optional-4169E1) | The size, in GB, of the ML storage volume attached to individual inference instance associated with the production variant. Currently only Amazon EBS gp2 storage volumes are supported. |
 | asyncInference | AsyncInferenceConfig | ![Optional](https://img.shields.io/badge/optional-4169E1) | Specifies configuration for how an endpoint performs asynchronous inference. Refer to [AsyncInferenceConfig](#asyncinferenceconfig) for details. If not defined, the endpoint will be configured as real-time.|
+| minCapacity | Integer | ![Optional](https://img.shields.io/badge/optional-4169E1) | Minimum number of instances that Application Auto Scaling may scale the endpoint variant down to during a scaling activity. Defaults to 1. |
+| maxCapacity | Integer | ![Optional](https://img.shields.io/badge/optional-4169E1) | Maximum number of instances that Application Auto Scaling may scale the endpoint variant up to during a scaling activity. Defaults to 2. |
 
 ### AsyncInferenceConfig
 
@@ -167,6 +173,8 @@ If defined, the SageMaker endpoint will perform asynchronous inference.
 - startupHealthCheckTimeoutInSeconds: 600 if not provided
 - modelDataDownloadTimeoutInSeconds: 600 if not provided
 - instanceCount: 1 if not provided
+- minCapacity: 1 if not provided
+- maxCapacity: 2 if not provided
 
 If async configuration is enabled:
 - Enable server-side encryption for SNS Topics using AWS managed KMS Key
diff --git a/src/patterns/gen-ai/aws-model-deployment-sagemaker/custom-sagemaker-endpoint.ts b/src/patterns/gen-ai/aws-model-deployment-sagemaker/custom-sagemaker-endpoint.ts
@@ -11,6 +11,7 @@
  *  and limitations under the License.
  */
 import * as cdk from 'aws-cdk-lib';
+import * as applicationautoscaling from 'aws-cdk-lib/aws-applicationautoscaling';
 import * as iam from 'aws-cdk-lib/aws-iam';
 import * as kms from 'aws-cdk-lib/aws-kms';
 import * as sagemaker from 'aws-cdk-lib/aws-sagemaker';
@@ -32,6 +33,8 @@ export interface CustomSageMakerEndpointProps {
   readonly modelId: string;
   readonly endpointName: string;
   readonly instanceType: SageMakerInstanceType;
+  readonly minCapacity?:number;
+  readonly maxCapacity?:number;
   readonly container: ContainerImage;
   readonly instanceCount?: number;
   readonly role?: iam.Role;
@@ -51,6 +54,7 @@ export class CustomSageMakerEndpoint extends SageMakerEndpointBase implements ia
   public readonly cfnModel: sagemaker.CfnModel;
   public readonly cfnEndpoint: sagemaker.CfnEndpoint;
   public readonly cfnEndpointConfig: sagemaker.CfnEndpointConfig;
+  public readonly scalingPolicy: applicationautoscaling.StepScalingPolicy;
   public readonly successTopic?: sns.Topic;
   public readonly errorTopic?: sns.Topic;
 
@@ -123,21 +127,24 @@ export class CustomSageMakerEndpoint extends SageMakerEndpointBase implements ia
       vpcConfig: props.vpcConfig,
     });
 
+    const productionVariant: sagemaker.CfnEndpointConfig.ProductionVariantProperty =
+      {
+        instanceType: this.instanceType.toString(),
+        initialVariantWeight: 1,
+        initialInstanceCount: this.instanceCount,
+        variantName: 'AllTraffic',
+        volumeSizeInGb: props.volumeSizeInGb,
+        modelName: model.getAtt('ModelName').toString(),
+        containerStartupHealthCheckTimeoutInSeconds: this.startupHealthCheckTimeoutInSeconds,
+        modelDataDownloadTimeoutInSeconds: this.modelDataDownloadTimeoutInSeconds,
+      };
+
+
     const endpointConfig = new sagemaker.CfnEndpointConfig(scope, `EndpointConfig-${id}`, {
-      productionVariants: [
-        {
-          instanceType: this.instanceType.toString(),
-          initialVariantWeight: 1,
-          initialInstanceCount: this.instanceCount,
-          variantName: 'AllTraffic',
-          volumeSizeInGb: props.volumeSizeInGb,
-          modelName: model.getAtt('ModelName').toString(),
-          containerStartupHealthCheckTimeoutInSeconds: this.startupHealthCheckTimeoutInSeconds,
-          modelDataDownloadTimeoutInSeconds: this.modelDataDownloadTimeoutInSeconds,
-        },
-      ],
+      productionVariants: [productionVariant],
     });
 
+
     if (props.asyncInference) {
 
       // build sns topics for success and failure
@@ -180,10 +187,12 @@ export class CustomSageMakerEndpoint extends SageMakerEndpointBase implements ia
 
     endpoint.addDependency(endpointConfig);
 
+
     this.cfnModel = model;
     this.cfnEndpoint = endpoint;
     this.cfnEndpointConfig = endpointConfig;
     this.endpointArn = endpoint.ref;
+    this.scalingPolicy = this.buildScalingPolicy(endpoint, productionVariant, props );
   }
 
   public addToRolePolicy(statement: iam.PolicyStatement) {
@@ -202,6 +211,56 @@ export class CustomSageMakerEndpoint extends SageMakerEndpointBase implements ia
     });
   }
 
+  private buildScalingPolicy( // Creates the Application Auto Scaling target and the step scaling policy for this endpoint's production variant.
+    endpoint: sagemaker.CfnEndpoint, // endpoint whose attrEndpointName feeds the resource id and the metric dimensions
+    productionVariants: sagemaker.CfnEndpointConfig.ProductionVariantProperty, // a single variant (despite the plural name); only variantName is read here
+    props: CustomSageMakerEndpointProps): applicationautoscaling.StepScalingPolicy { // props supplies the optional minCapacity/maxCapacity; returns the newly created policy
+    const resourceId = `endpoint/${endpoint.attrEndpointName}/variant/${productionVariants.variantName}`; // identifies the variant as endpoint/<endpoint name>/variant/<variant name>
+    // Register the variant's instance count (sagemaker:variant:DesiredInstanceCount) as the scalable target.
+    const scalableTarget = new applicationautoscaling.ScalableTarget(
+      this,
+      'ScalableTarget',
+      {
+        serviceNamespace: applicationautoscaling.ServiceNamespace.SAGEMAKER,
+        resourceId: resourceId,
+        scalableDimension: 'sagemaker:variant:DesiredInstanceCount',
+        minCapacity: props.minCapacity ?? 1, // falls back to 1 when the caller gives no minimum
+        maxCapacity: props.maxCapacity ?? 2, // falls back to 2 when the caller gives no maximum; NOTE(review): minCapacity <= maxCapacity is not checked in this method - confirm it is validated elsewhere
+      },
+    );
+    scalableTarget.node.addDependency(endpoint); // declare a construct dependency of the scalable target on the endpoint
+    // Metric that drives the policy: the Average of ApproximateBacklogSizePerInstance over 5-minute periods.
+    const approximateBacklogMetric = new cdk.aws_cloudwatch.Metric({
+      namespace: 'AWS/SageMaker',
+      metricName: 'ApproximateBacklogSizePerInstance', // NOTE(review): this method is called unconditionally from the constructor; the metric name suggests an async-inference backlog - confirm it is emitted for real-time endpoints too
+      dimensionsMap: {
+        Endpoint: endpoint.attrEndpointName, // NOTE(review): verify the dimension key against the SageMaker CloudWatch docs (possibly `EndpointName`) - TODO confirm
+        Variant: productionVariants.variantName, // NOTE(review): likewise confirm the `Variant` key (possibly `VariantName`)
+      },
+      statistic: 'Average',
+      period: cdk.Duration.minutes(5),
+    });
+    // Step policy on the scalable target: adjusts capacity by +/-1 instance (CHANGE_IN_CAPACITY) based on the metric above.
+    const scalingPolicy = new applicationautoscaling.StepScalingPolicy(
+      this,
+      'ScalingPolicy',
+      {
+        scalingTarget: scalableTarget,
+        adjustmentType: applicationautoscaling.AdjustmentType.CHANGE_IN_CAPACITY,
+        metric: approximateBacklogMetric,
+        scalingSteps: [
+          { upper: 0, change: -1, lower: 0 }, // remove one instance for the interval [0, 0]; NOTE(review): lower equals upper - confirm CDK treats this as the intended "empty backlog" step
+          { change: 1, lower: 0.5 }, // add one instance once the metric is at or above 0.5 (no upper bound given)
+        ],
+        cooldown: cdk.Duration.minutes(5),
+        datapointsToAlarm: 1, // a single datapoint ...
+        evaluationPeriods: 1, // ... within a single evaluation period is configured to trigger the step
+      },
+    );
+    // Return the policy; the constructor stores it in the public `scalingPolicy` field.
+    return scalingPolicy;
+  }
+
   private buildSnsTopic(topicName: string, displayName: string): sns.Topic {
     const masterKey = kms.Alias.fromAliasName(this, `aws-managed-key-${topicName}`, 'alias/aws/sns');
 
diff --git a/test/patterns/gen-ai/aws-model-deployment-sagemaker/aws-custom-sagemaker-endpoint.test.ts b/test/patterns/gen-ai/aws-model-deployment-sagemaker/aws-custom-sagemaker-endpoint.test.ts
@@ -50,6 +50,8 @@ describe('CustomSageMakerEndpoint construct no async', () => {
       endpointName: 'testbgebase',
       instanceCount: 1,
       volumeSizeInGb: 100,
+      minCapacity: 1,
+      maxCapacity: 2,
     });
     CstTestTemplate = Template.fromStack(CstTestStack);