
Commit 6ffe634

Committed by Andrew Kenworthy (adwk67) and Razvan-Daniel Mihai (razvan)

Add config maps (#50)

## Description

Make inline spark properties available via ConfigMap.

Co-authored-by: Razvan-Daniel Mihai <[email protected]>
Co-authored-by: Andrew Kenworthy <[email protected]>

1 parent b6ccc79 commit 6ffe634

20 files changed: +5391 −5 lines
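The commit description says the inline `sparkConf` properties of a `SparkApplication` become available via a `ConfigMap`, and the new example below passes a job argument the same way. As a rough sketch of the idea only, using plain `k8s-openapi` types; the function name and the `spark.properties` key are illustrative assumptions, not the operator's actual builder code:

```rust
use std::collections::BTreeMap;

use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;

/// Illustrative sketch: render sparkConf entries into a properties-style file
/// and wrap it in a ConfigMap that could be mounted into driver/executor pods.
/// The "spark.properties" key and this function are assumptions, not the names
/// used by the operator.
fn spark_conf_config_map(name: &str, conf: &BTreeMap<String, String>) -> ConfigMap {
    // "key value" lines, one per property, as in a spark-defaults style file.
    let properties = conf
        .iter()
        .map(|(key, value)| format!("{} {}", key, value))
        .collect::<Vec<_>>()
        .join("\n");

    let mut data = BTreeMap::new();
    data.insert("spark.properties".to_string(), properties);

    ConfigMap {
        metadata: ObjectMeta {
            name: Some(name.to_string()),
            ..ObjectMeta::default()
        },
        data: Some(data),
        ..ConfigMap::default()
    }
}
```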

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@ All notable changes to this project will be documented in this file.
 - Initial commit
 - ServiceAccount, ClusterRole and RoleBinding for Spark driver ([#39])
 - S3 credentials can be provided via a Secret ([#42])
+- Job information can be passed via a configuration map ([#50])

 [#39]: https://github.com/stackabletech/spark-k8s-operator/pull/39
 [#42]: https://github.com/stackabletech/spark-k8s-operator/pull/42
+[#50]: https://github.com/stackabletech/spark-k8s-operator/pull/50
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cm-job-arguments # <1>
+data:
+  job-args.txt: |
+    s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv # <2>
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+metadata:
+  name: ny-tlc-report-configmap
+  namespace: default
+spec:
+  version: "1.0"
+  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
+  mode: cluster
+  mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar # <3>
+  mainClass: tech.stackable.demo.spark.NYTLCReport
+  volumes:
+    - name: job-deps
+      persistentVolumeClaim:
+        claimName: pvc-ksv
+    - name: cm-job-arguments
+      configMap:
+        name: cm-job-arguments # <4>
+  args:
+    - "--input /arguments/job-args.txt" # <5>
+  sparkConf:
+    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
+    "spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+    "spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments # <6>
+        mountPath: /arguments # <7>
+  executor:
+    cores: 1
+    instances: 3
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments # <6>
+        mountPath: /arguments # <7>

docs/modules/ROOT/pages/rbac.adoc

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ then the cluster-role has to be created assigned to the service account manually
 [source,bash]
 ----
 kubectl create clusterrolebinding spark-role --clusterrole=spark-driver-edit-role --serviceaccount=default:default
-----
+----

docs/modules/ROOT/pages/usage.adoc

Lines changed: 20 additions & 2 deletions
@@ -92,15 +92,32 @@ include::example$example-sparkapp-pvc.yaml[]
 include::example$example-sparkapp-s3-private.yaml[]
 ----

-<1> Job python artifact (local)
+<1> Job python artifact (located in S3)
 <2> Artifact class
-<3> S3 section, specifying the existing secret and S3 end-point ( in this case, Min-IO)
+<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO)
 <4> Credentials secret
 <5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources...
 <6> ...in this case, in s3, accessed with the credentials defined in the secret
 <7> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing
 <8> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors

+=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map
+
+[source,yaml]
+----
+include::example$example-configmap.yaml[]
+----
+[source,yaml]
+----
+include::example$example-sparkapp-configmap.yaml[]
+----
+<1> Name of the configuration map
+<2> Argument required by the job
+<3> Job scala artifact that requires an input argument
+<4> The volume backed by the configuration map
+<5> The expected job argument, accessed via the mounted configuration map file
+<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor
+<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`)

 == CRD argument coverage

@@ -205,3 +222,4 @@ Below are listed the CRD fields that can be defined by the user:
 |`spec.executor.volumeMounts.mountPath`
 |Volume mount path
 |===
+
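The mechanics described in the new docs section: the ConfigMap is mounted at `/arguments` in both driver and executor, and the application is started with `--input /arguments/job-args.txt`, so at runtime it reads the real S3 input path from that file. A minimal sketch of that read, written in Rust purely for illustration (the example job itself is a Scala artifact):

```rust
use std::fs;

fn main() -> std::io::Result<()> {
    // The ConfigMap key `job-args.txt` appears as a file under the volume
    // mount path /arguments inside the pod.
    let args = fs::read_to_string("/arguments/job-args.txt")?;

    // Each non-empty line is one argument value; here it is the S3 path of
    // the CSV file the report job should process.
    for input_path in args.lines().filter(|line| !line.trim().is_empty()) {
        println!("input: {}", input_path.trim());
    }
    Ok(())
}
```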

examples/ny-tlc-report-configmap.yaml

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cm-job-arguments
+data:
+  job-args.txt: |
+    s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+metadata:
+  name: ny-tlc-report-configmap
+  namespace: default
+spec:
+  version: "1.0"
+  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
+  mode: cluster
+  mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar
+  mainClass: tech.stackable.demo.spark.NYTLCReport
+  volumes:
+    - name: job-deps
+      persistentVolumeClaim:
+        claimName: pvc-ksv
+    - name: cm-job-arguments
+      configMap:
+        name: cm-job-arguments
+  args:
+    - "--input /arguments/job-args.txt"
+  sparkConf:
+    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
+    "spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+    "spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments
+        mountPath: /arguments
+  executor:
+    cores: 1
+    instances: 3
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments
+        mountPath: /arguments

rust/operator-binary/src/spark_k8s_controller.rs

Lines changed: 20 additions & 2 deletions
@@ -164,10 +164,14 @@ fn pod_template(
     volume_mounts: &[VolumeMount],
     env: &[EnvVar],
 ) -> Result<Pod> {
+    let volumes = volumes.to_vec();
+    let volume_mounts = volume_mounts.to_vec();
+
     let mut container = ContainerBuilder::new(container_name);
     container
-        .add_volume_mounts(volume_mounts.to_vec())
+        .add_volume_mounts(volume_mounts)
        .add_env_vars(env.to_vec());
+
     if job_container.is_some() {
         container.add_volume_mount(VOLUME_MOUNT_NAME_JOB, VOLUME_MOUNT_PATH_JOB);
     }
@@ -185,7 +189,7 @@
     template
         .metadata_default()
         .add_container(container.build())
-        .add_volumes(volumes.to_vec());
+        .add_volumes(volumes);

     if let Some(container) = requirements_container.clone() {
         template.add_init_container(container);
@@ -299,6 +303,7 @@ fn spark_job(
         ..Volume::default()
     }];
     volumes.extend(spark_application.volumes());
+
     if job_container.is_some() {
         volumes.push(Volume {
             name: String::from(VOLUME_MOUNT_NAME_JOB),
@@ -393,6 +398,7 @@ mod tests {
     use crate::spark_k8s_controller::spark_job;
     use crate::spark_k8s_controller::{build_spark_role_serviceaccount, pod_template_config_map};
     use crate::SparkApplication;
+    use std::collections::BTreeMap;

     #[test]
     fn test_pod_config_map() {
@@ -501,4 +507,16 @@
             job.metadata.owner_references.map(|r| r[0].uid.to_string())
         );
     }
+
+    #[test]
+    fn test_cast() {
+        let properties = serde_yaml::from_str::<BTreeMap<String, String>>(
+            r#"
+spark.hadoop.fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider
+spark.driver.extraClassPath: /dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar
+spark.executor.extraClassPath: /dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar
+"#,
+        );
+        assert_eq!(3, properties.unwrap().len());
+    }
 }
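The new `test_cast` test pins down that the inline `sparkConf` section deserializes into a `BTreeMap<String, String>`. One common way such a map ends up on a `spark-submit` command line is as repeated `--conf key=value` flags; a small hypothetical helper (not part of this commit) could look like:

```rust
use std::collections::BTreeMap;

/// Hypothetical helper: turn parsed sparkConf entries into `--conf key=value`
/// arguments in the form spark-submit accepts.
fn spark_conf_args(conf: &BTreeMap<String, String>) -> Vec<String> {
    conf.iter()
        .map(|(key, value)| format!("--conf {}={}", key, value))
        .collect()
}

#[test]
fn test_spark_conf_args() {
    let mut conf = BTreeMap::new();
    conf.insert(
        "spark.driver.extraClassPath".to_string(),
        "/dependencies/jars/hadoop-aws-3.2.0.jar".to_string(),
    );
    assert_eq!(
        vec!["--conf spark.driver.extraClassPath=/dependencies/jars/hadoop-aws-3.2.0.jar"],
        spark_conf_args(&conf)
    );
}
```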
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+metadata:
+  name: minio
+timeout: 300
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: test-minio
+  labels:
+    app: minio
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: minio-mc
+status:
+  readyReplicas: 1
+  replicas: 1
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: minio-mc
+  labels:
+    app: minio-mc
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: minio-mc
+  template:
+    metadata:
+      labels:
+        app: minio-mc
+    spec:
+      containers:
+        - name: minio-mc
+          image: bitnami/minio:2022-debian-10
+          command: ["/bin/sh"]
+          stdin: true
+          tty: true
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+commands:
+  - script: >-
+      helm install test-minio
+      --namespace $NAMESPACE
+      --set mode=standalone
+      --set replicas=1
+      --set persistence.enabled=false
+      --set buckets[0].name=my-bucket,buckets[0].policy=public
+      --repo https://charts.min.io/ minio
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+commands:
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      sh -c 'mc alias set test-minio http://test-minio:9000/'
+  - script: kubectl cp -n $NAMESPACE ny-tlc-report-1.1.0.jar minio-mc-0:/tmp
+  - script: kubectl cp -n $NAMESPACE yellow_tripdata_2021-07.csv minio-mc-0:/tmp
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      mc cp /tmp/ny-tlc-report-1.1.0.jar test-minio/my-bucket
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      mc cp /tmp/yellow_tripdata_2021-07.csv test-minio/my-bucket
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+metadata:
+  name: spark-ny-deps-job
+timeout: 300
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: spark-ny-deps-job
+status:
+  succeeded: 1
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: spark-ny-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: spark-ny-deps-job
+spec:
+  template:
+    spec:
+      restartPolicy: Never
+      volumes:
+        - name: job-deps
+          persistentVolumeClaim:
+            claimName: spark-ny-pvc
+      containers:
+        - name: aws-deps
+          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0
+          env:
+            - name: DEST_DIR
+              value: "/dependencies/jars"
+            - name: AWS
+              value: "1.11.375"
+            - name: HADOOP
+              value: "3.2"
+          command:
+            [
+              "bash",
+              "-x",
+              "-o",
+              "pipefail",
+              "-c",
+              "mkdir -p ${DEST_DIR} && curl -L https://search.maven.org/remotecontent?filepath=org/apache/hadoop/hadoop-aws/${HADOOP}.0/hadoop-aws-${HADOOP}.0.jar -o ${DEST_DIR}/hadoop-aws-${HADOOP}.0.jar && curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS}/aws-java-sdk-bundle-${AWS}.jar -o ${DEST_DIR}/aws-java-sdk-bundle-${AWS}.jar && chown -R stackable:stackable ${DEST_DIR} && chmod -R a=,u=rwX ${DEST_DIR}",
+            ]
+          volumeMounts:
+            - name: job-deps
+              mountPath: /dependencies
+          securityContext:
+            runAsUser: 0
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: list-jars-job
+status:
+  succeeded: 1
