Skip to content

Commit 759e692

Browse files
authored
chore(glue-alpha): refactor spark jobs code-related props handling (#33567)
### Issue # (if applicable) Related to #33356 ### Reason for this change address the feedback section of **extraJars, extraFiles, extraPythonFiles, extraJarsFirst** + refactor to reduce duplicative code across the different spark job classes ### Description of changes - update the different spark jobs props to - make `extraPythonFiles`, `extraFiles`, `extraJars`, and `extraJarsFirst` available in pyspark jobs - make `extraFiles`, `extraJars`, and `extraJarsFirst` available in scala spark jobs - introduce a base `SparkJob` class and `SparkJobProps` to reduce duplicate logic across different spark job classes - introduce `setupSparkCodeArguments` `protected` method along with an interface `SparkExtraCodeProps` to handle setting up `Code`-related arguments - fix spark ui setup to be consistent with docs for `sparkUI` prop - merge content of `spark-ui-utils.ts` into `spark-job.ts` and retire it - cleanup comma expression for role setup - fix typos and do renames to follow conventions ### Describe any new or updated permissions being added N/A ### Description of how you validated changes updated unit tests ### Checklist - [x] My code adheres to the [CONTRIBUTING GUIDE](https://github.com/aws/aws-cdk/blob/main/CONTRIBUTING.md) and [DESIGN GUIDELINES](https://github.com/aws/aws-cdk/blob/main/docs/DESIGN_GUIDELINES.md) ---- *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
1 parent 88934f2 commit 759e692

20 files changed

+509
-802
lines changed

packages/@aws-cdk/aws-glue-alpha/lib/index.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ export * from './security-configuration';
1212
export * from './storage-parameter';
1313
export * from './constants';
1414
export * from './jobs/job';
15+
export * from './jobs/spark-job';
1516
export * from './jobs/pyspark-etl-job';
1617
export * from './jobs/pyspark-flex-etl-job';
1718
export * from './jobs/pyspark-streaming-job';
@@ -20,7 +21,6 @@ export * from './jobs/ray-job';
2021
export * from './jobs/scala-spark-etl-job';
2122
export * from './jobs/scala-spark-flex-etl-job';
2223
export * from './jobs/scala-spark-streaming-job';
23-
export * from './jobs/spark-ui-utils';
2424
export * from './table-base';
2525
export * from './table-deprecated';
2626
export * from './triggers/workflow';

packages/@aws-cdk/aws-glue-alpha/lib/jobs/job.ts

+12-12
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import * as iam from 'aws-cdk-lib/aws-iam';
44
import * as logs from 'aws-cdk-lib/aws-logs';
55
import * as cdk from 'aws-cdk-lib/core';
66
import * as constructs from 'constructs';
7-
import { Code } from '..';
7+
import { Code } from '../code';
88
import { MetricType, JobState, WorkerType, GlueVersion } from '../constants';
99
import { IConnection } from '../connection';
1010
import { ISecurityConfiguration } from '../security-configuration';
@@ -88,7 +88,7 @@ export interface IJob extends cdk.IResource, iam.IGrantable {
8888
*/
8989
export interface ContinuousLoggingProps {
9090
/**
91-
* Enable continouous logging.
91+
* Enable continuous logging.
9292
*/
9393
readonly enabled: boolean;
9494

@@ -126,8 +126,8 @@ export interface ContinuousLoggingProps {
126126
/**
127127
* A base class is needed to be able to import existing Jobs into a CDK app to
128128
* reference as part of a larger stack or construct. JobBase has the subset
129-
* of attribtues required to idenitfy and reference an existing Glue Job,
130-
* as well as some CloudWatch metric conveneince functions to configure an
129+
* of attributes required to identify and reference an existing Glue Job,
130+
* as well as some CloudWatch metric convenience functions to configure an
131131
* event-driven flow using the job.
132132
*/
133133
export abstract class JobBase extends cdk.Resource implements IJob {
@@ -282,10 +282,10 @@ export abstract class JobBase extends cdk.Resource implements IJob {
282282

283283
/**
284284
* A subset of Job attributes are required for importing an existing job
285-
* into a CDK project. This is ionly used when using fromJobAttributes
285+
* into a CDK project. This is only used when using fromJobAttributes
286286
* to identify and reference the existing job.
287287
*/
288-
export interface JobImportAttributes {
288+
export interface JobAttributes {
289289
/**
290290
* The name of the job.
291291
*/
@@ -301,9 +301,9 @@ export interface JobImportAttributes {
301301
}
302302

303303
/**
304-
* JobProperties will be used to create new Glue Jobs using this L2 Construct.
304+
* JobProps will be used to create new Glue Jobs using this L2 Construct.
305305
*/
306-
export interface JobProperties {
306+
export interface JobProps {
307307
/**
308308
* Script Code Location (required)
309309
* Script to run when the Glue job executes. Can be uploaded
@@ -463,7 +463,7 @@ export abstract class Job extends JobBase {
463463
* @param id The construct's id.
464464
* @param attrs Attributes for the Glue Job we want to import
465465
*/
466-
public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobImportAttributes): IJob {
466+
public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob {
467467
class Import extends JobBase {
468468
public readonly jobName = attrs.jobName;
469469
public readonly jobArn = this.buildJobArn(scope, attrs.jobName);
@@ -496,12 +496,12 @@ export abstract class Job extends JobBase {
496496
}
497497

498498
/**
499-
* Setup Continuous Loggiung Properties
499+
* Setup Continuous Logging Properties
500500
* @param role The IAM role to use for continuous logging
501501
* @param props The properties for continuous logging configuration
502502
* @returns String containing the args for the continuous logging command
503503
*/
504-
public setupContinuousLogging(role: iam.IRole, props: ContinuousLoggingProps | undefined) : any {
504+
protected setupContinuousLogging(role: iam.IRole, props: ContinuousLoggingProps | undefined) : any {
505505
// If the developer has explicitly disabled continuous logging return no args
506506
if (props && !props.enabled) {
507507
return {};
@@ -552,7 +552,7 @@ function metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloud
552552
namespace: 'AWS/Events',
553553
metricName: 'TriggeredRules',
554554
dimensionsMap: { RuleName: rule.ruleName },
555-
statistic: cloudwatch.Statistic.SUM,
555+
statistic: cloudwatch.Stats.SUM,
556556
...props,
557557
}).attachTo(rule);
558558
}
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,13 @@
1-
import * as iam from 'aws-cdk-lib/aws-iam';
2-
import { Bucket, BucketEncryption } from 'aws-cdk-lib/aws-s3';
31
import { CfnJob } from 'aws-cdk-lib/aws-glue';
4-
import { Job, JobProperties } from './job';
52
import { Construct } from 'constructs';
63
import { JobType, GlueVersion, JobLanguage, PythonVersion, WorkerType } from '../constants';
7-
import { SparkUIProps, SparkUILoggingLocation, validateSparkUiPrefix, cleanSparkUiPrefixForGrant } from './spark-ui-utils';
84
import { Code } from '../code';
9-
import { addConstructMetadata } from 'aws-cdk-lib/core/lib/metadata-resource';
5+
import { SparkJob, SparkJobProps } from './spark-job';
106

117
/**
128
* Properties for creating a Python Spark ETL job
139
*/
14-
export interface PySparkEtlJobProps extends JobProperties {
15-
/**
16-
* Enables the Spark UI debugging and monitoring with the specified props.
17-
*
18-
* @default - Spark UI debugging and monitoring is disabled.
19-
*
20-
* @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
21-
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
22-
*/
23-
readonly sparkUI?: SparkUIProps;
24-
10+
export interface PySparkEtlJobProps extends SparkJobProps {
2511
/**
2612
* Extra Python Files S3 URL (optional)
2713
* S3 URL where additional python dependencies are located
@@ -46,6 +32,15 @@ export interface PySparkEtlJobProps extends JobProperties {
4632
*/
4733
readonly extraJars?: Code[];
4834

35+
/**
36+
* Setting this value to true prioritizes the customer's extra JAR files in the classpath.
37+
*
38+
* @default false - priority is not given to user-provided jars
39+
*
40+
* @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
41+
*/
42+
readonly extraJarsFirst?: boolean;
43+
4944
/**
5045
* Specifies whether job run queuing is enabled for the job runs for this job.
5146
* A value of true means job run queuing is enabled for the job runs.
@@ -71,57 +66,20 @@ export interface PySparkEtlJobProps extends JobProperties {
7166
* You can find more details about version, worker type and other features
7267
* in Glue's public documentation.
7368
*/
74-
export class PySparkEtlJob extends Job {
69+
export class PySparkEtlJob extends SparkJob {
7570
public readonly jobArn: string;
7671
public readonly jobName: string;
77-
public readonly role: iam.IRole;
78-
public readonly grantPrincipal: iam.IPrincipal;
79-
80-
/**
81-
* The Spark UI logs location if Spark UI monitoring and debugging is enabled.
82-
*
83-
* @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
84-
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
85-
*/
86-
public readonly sparkUILoggingLocation?: SparkUILoggingLocation;
8772

8873
/**
8974
* PySparkEtlJob constructor
9075
*/
9176
constructor(scope: Construct, id: string, props: PySparkEtlJobProps) {
92-
super(scope, id, {
93-
physicalName: props.jobName,
94-
});
95-
// Enhanced CDK Analytics Telemetry
96-
addConstructMetadata(this, props);
97-
98-
// Set up role and permissions for principal
99-
this.role = props.role, {
100-
assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'),
101-
managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')],
102-
};
103-
this.grantPrincipal = this.role;
77+
super(scope, id, props);
10478

105-
// Enable SparkUI by default as a best practice
106-
const sparkUIArgs = props.sparkUI?.bucket ? this.setupSparkUI(this.role, props.sparkUI) : undefined;
107-
this.sparkUILoggingLocation = sparkUIArgs?.location;
108-
109-
// Enable CloudWatch metrics and continuous logging by default as a best practice
110-
const continuousLoggingArgs = this.setupContinuousLogging(this.role, props.continuousLogging);
111-
const profilingMetricsArgs = { '--enable-metrics': '' };
112-
const observabilityMetricsArgs = { '--enable-observability-metrics': 'true' };
113-
114-
// Gather executable arguments
115-
const execuatbleArgs = this.executableArguments(props);
116-
117-
// Conbine command line arguments into a single line item
79+
// Combine command line arguments into a single line item
11880
const defaultArguments = {
119-
...execuatbleArgs,
120-
...continuousLoggingArgs,
121-
...profilingMetricsArgs,
122-
...observabilityMetricsArgs,
123-
...sparkUIArgs?.args,
124-
...this.checkNoReservedArgs(props.defaultArguments),
81+
...this.executableArguments(props),
82+
...this.nonExecutableCommonArguments(props),
12583
};
12684

12785
const jobResource = new CfnJob(this, 'Resource', {
@@ -159,35 +117,7 @@ export class PySparkEtlJob extends Job {
159117
private executableArguments(props: PySparkEtlJobProps) {
160118
const args: { [key: string]: string } = {};
161119
args['--job-language'] = JobLanguage.PYTHON;
162-
163-
if (props.extraPythonFiles && props.extraPythonFiles.length > 0) {
164-
args['--extra-py-files'] = props.extraPythonFiles.map(code => this.codeS3ObjectUrl(code)).join(',');
165-
}
166-
if (props.extraFiles && props.extraFiles.length > 0) {
167-
args['--extra-files'] = props.extraFiles.map(code => this.codeS3ObjectUrl(code)).join(',');
168-
}
169-
if (props.extraJars && props.extraJars?.length > 0) {
170-
args['--extra-jars'] = props.extraJars.map(code => this.codeS3ObjectUrl(code)).join(',');
171-
}
172-
120+
this.setupExtraCodeArguments(args, props);
173121
return args;
174122
}
175-
176-
private setupSparkUI(role: iam.IRole, sparkUiProps: SparkUIProps) {
177-
validateSparkUiPrefix(sparkUiProps.prefix);
178-
const bucket = sparkUiProps.bucket ?? new Bucket(this, 'SparkUIBucket', { enforceSSL: true, encryption: BucketEncryption.S3_MANAGED });
179-
bucket.grantReadWrite(role, cleanSparkUiPrefixForGrant(sparkUiProps.prefix));
180-
const args = {
181-
'--enable-spark-ui': 'true',
182-
'--spark-event-logs-path': bucket.s3UrlForObject(sparkUiProps.prefix).replace(/\/?$/, '/'), // path will always end with a slash
183-
};
184-
185-
return {
186-
location: {
187-
prefix: sparkUiProps.prefix,
188-
bucket,
189-
},
190-
args,
191-
};
192-
}
193123
}

0 commit comments

Comments
 (0)