
Commit 6ffe634

Committed by Andrew Kenworthy (adwk67) and Razvan-Daniel Mihai (razvan)

Add config maps (#50)

## Description

Make inline spark properties available via ConfigMap.

Co-authored-by: Razvan-Daniel Mihai <[email protected]>
Co-authored-by: Andrew Kenworthy <[email protected]>

1 parent b6ccc79 commit 6ffe634

20 files changed: +5391 −5 lines
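The commit description says the inline `sparkConf` properties of a `SparkApplication` become available via a `ConfigMap`, and the new example below passes a job argument the same way. As a rough sketch of the idea only, using plain `k8s-openapi` types; the function name and the `spark.properties` key are illustrative assumptions, not the operator's actual builder code:

```rust
use std::collections::BTreeMap;

use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;

/// Illustrative sketch: render sparkConf entries into a properties-style file
/// and wrap it in a ConfigMap that could be mounted into driver/executor pods.
/// The "spark.properties" key and this function are assumptions, not the names
/// used by the operator.
fn spark_conf_config_map(name: &str, conf: &BTreeMap<String, String>) -> ConfigMap {
    // "key value" lines, one per property, as in a spark-defaults style file.
    let properties = conf
        .iter()
        .map(|(key, value)| format!("{} {}", key, value))
        .collect::<Vec<_>>()
        .join("\n");

    let mut data = BTreeMap::new();
    data.insert("spark.properties".to_string(), properties);

    ConfigMap {
        metadata: ObjectMeta {
            name: Some(name.to_string()),
            ..ObjectMeta::default()
        },
        data: Some(data),
        ..ConfigMap::default()
    }
}
```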

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@ All notable changes to this project will be documented in this file.
 - Initial commit
 - ServiceAccount, ClusterRole and RoleBinding for Spark driver ([#39])
 - S3 credentials can be provided via a Secret ([#42])
+- Job information can be passed via a configuration map ([#50])

 [#39]: https://github.com/stackabletech/spark-k8s-operator/pull/39
 [#42]: https://github.com/stackabletech/spark-k8s-operator/pull/42
+[#50]: https://github.com/stackabletech/spark-k8s-operator/pull/50
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cm-job-arguments # <1>
+data:
+  job-args.txt: |
+    s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv # <2>
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+metadata:
+  name: ny-tlc-report-configmap
+  namespace: default
+spec:
+  version: "1.0"
+  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
+  mode: cluster
+  mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar # <3>
+  mainClass: tech.stackable.demo.spark.NYTLCReport
+  volumes:
+    - name: job-deps
+      persistentVolumeClaim:
+        claimName: pvc-ksv
+    - name: cm-job-arguments
+      configMap:
+        name: cm-job-arguments # <4>
+  args:
+    - "--input /arguments/job-args.txt" # <5>
+  sparkConf:
+    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
+    "spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+    "spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments # <6>
+        mountPath: /arguments # <7>
+  executor:
+    cores: 1
+    instances: 3
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments # <6>
+        mountPath: /arguments # <7>

docs/modules/ROOT/pages/rbac.adoc

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ then the cluster-role has to be created assigned to the service account manually
 [source,bash]
 ----
 kubectl create clusterrolebinding spark-role --clusterrole=spark-driver-edit-role --serviceaccount=default:default
-----
+----

docs/modules/ROOT/pages/usage.adoc

Lines changed: 20 additions & 2 deletions
@@ -92,15 +92,32 @@ include::example$example-sparkapp-pvc.yaml[]
 include::example$example-sparkapp-s3-private.yaml[]
 ----

-<1> Job python artifact (local)
+<1> Job python artifact (located in S3)
 <2> Artifact class
-<3> S3 section, specifying the existing secret and S3 end-point ( in this case, Min-IO)
+<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO)
 <4> Credentials secret
 <5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources...
 <6> ...in this case, in s3, accessed with the credentials defined in the secret
 <7> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing
 <8> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors

+=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map
+
+[source,yaml]
+----
+include::example$example-configmap.yaml[]
+----
+[source,yaml]
+----
+include::example$example-sparkapp-configmap.yaml[]
+----
+<1> Name of the configuration map
+<2> Argument required by the job
+<3> Job scala artifact that requires an input argument
+<4> The volume backed by the configuration map
+<5> The expected job argument, accessed via the mounted configuration map file
+<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor
+<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`)

 == CRD argument coverage

@@ -205,3 +222,4 @@ Below are listed the CRD fields that can be defined by the user:
 |`spec.executor.volumeMounts.mountPath`
 |Volume mount path
 |===
+
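The mechanics described in the new docs section: the ConfigMap is mounted at `/arguments` in both driver and executor, and the application is started with `--input /arguments/job-args.txt`, so at runtime it reads the real S3 input path from that file. A minimal sketch of that read, written in Rust purely for illustration (the example job itself is a Scala artifact):

```rust
use std::fs;

fn main() -> std::io::Result<()> {
    // The ConfigMap key `job-args.txt` appears as a file under the volume
    // mount path /arguments inside the pod.
    let args = fs::read_to_string("/arguments/job-args.txt")?;

    // Each non-empty line is one argument value; here it is the S3 path of
    // the CSV file the report job should process.
    for input_path in args.lines().filter(|line| !line.trim().is_empty()) {
        println!("input: {}", input_path.trim());
    }
    Ok(())
}
```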

examples/ny-tlc-report-configmap.yaml

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cm-job-arguments
+data:
+  job-args.txt: |
+    s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+metadata:
+  name: ny-tlc-report-configmap
+  namespace: default
+spec:
+  version: "1.0"
+  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
+  mode: cluster
+  mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar
+  mainClass: tech.stackable.demo.spark.NYTLCReport
+  volumes:
+    - name: job-deps
+      persistentVolumeClaim:
+        claimName: pvc-ksv
+    - name: cm-job-arguments
+      configMap:
+        name: cm-job-arguments
+  args:
+    - "--input /arguments/job-args.txt"
+  sparkConf:
+    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
+    "spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+    "spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments
+        mountPath: /arguments
+  executor:
+    cores: 1
+    instances: 3
+    memory: "512m"
+    volumeMounts:
+      - name: job-deps
+        mountPath: /dependencies
+      - name: cm-job-arguments
+        mountPath: /arguments

rust/operator-binary/src/spark_k8s_controller.rs

Lines changed: 20 additions & 2 deletions
@@ -164,10 +164,14 @@ fn pod_template(
     volume_mounts: &[VolumeMount],
     env: &[EnvVar],
 ) -> Result<Pod> {
+    let volumes = volumes.to_vec();
+    let volume_mounts = volume_mounts.to_vec();
+
     let mut container = ContainerBuilder::new(container_name);
     container
-        .add_volume_mounts(volume_mounts.to_vec())
+        .add_volume_mounts(volume_mounts)
        .add_env_vars(env.to_vec());
+
     if job_container.is_some() {
         container.add_volume_mount(VOLUME_MOUNT_NAME_JOB, VOLUME_MOUNT_PATH_JOB);
     }
@@ -185,7 +189,7 @@
     template
         .metadata_default()
         .add_container(container.build())
-        .add_volumes(volumes.to_vec());
+        .add_volumes(volumes);

     if let Some(container) = requirements_container.clone() {
         template.add_init_container(container);
@@ -299,6 +303,7 @@ fn spark_job(
         ..Volume::default()
     }];
     volumes.extend(spark_application.volumes());
+
     if job_container.is_some() {
         volumes.push(Volume {
             name: String::from(VOLUME_MOUNT_NAME_JOB),
@@ -393,6 +398,7 @@ mod tests {
     use crate::spark_k8s_controller::spark_job;
     use crate::spark_k8s_controller::{build_spark_role_serviceaccount, pod_template_config_map};
     use crate::SparkApplication;
+    use std::collections::BTreeMap;

     #[test]
     fn test_pod_config_map() {
@@ -501,4 +507,16 @@
             job.metadata.owner_references.map(|r| r[0].uid.to_string())
         );
     }
+
+    #[test]
+    fn test_cast() {
+        let properties = serde_yaml::from_str::<BTreeMap<String, String>>(
+            r#"
+spark.hadoop.fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider
+spark.driver.extraClassPath: /dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar
+spark.executor.extraClassPath: /dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar
+"#,
+        );
+        assert_eq!(3, properties.unwrap().len());
+    }
 }
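The new `test_cast` test pins down that the inline `sparkConf` section deserializes into a `BTreeMap<String, String>`. One common way such a map ends up on a `spark-submit` command line is as repeated `--conf key=value` flags; a small hypothetical helper (not part of this commit) could look like:

```rust
use std::collections::BTreeMap;

/// Hypothetical helper: turn parsed sparkConf entries into `--conf key=value`
/// arguments in the form spark-submit accepts.
fn spark_conf_args(conf: &BTreeMap<String, String>) -> Vec<String> {
    conf.iter()
        .map(|(key, value)| format!("--conf {}={}", key, value))
        .collect()
}

#[test]
fn test_spark_conf_args() {
    let mut conf = BTreeMap::new();
    conf.insert(
        "spark.driver.extraClassPath".to_string(),
        "/dependencies/jars/hadoop-aws-3.2.0.jar".to_string(),
    );
    assert_eq!(
        vec!["--conf spark.driver.extraClassPath=/dependencies/jars/hadoop-aws-3.2.0.jar"],
        spark_conf_args(&conf)
    );
}
```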
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+metadata:
+  name: minio
+timeout: 300
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: test-minio
+  labels:
+    app: minio
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: minio-mc
+status:
+  readyReplicas: 1
+  replicas: 1
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: minio-mc
+  labels:
+    app: minio-mc
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: minio-mc
+  template:
+    metadata:
+      labels:
+        app: minio-mc
+    spec:
+      containers:
+        - name: minio-mc
+          image: bitnami/minio:2022-debian-10
+          command: ["/bin/sh"]
+          stdin: true
+          tty: true
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+commands:
+  - script: >-
+      helm install test-minio
+      --namespace $NAMESPACE
+      --set mode=standalone
+      --set replicas=1
+      --set persistence.enabled=false
+      --set buckets[0].name=my-bucket,buckets[0].policy=public
+      --repo https://charts.min.io/ minio
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+commands:
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      sh -c 'mc alias set test-minio http://test-minio:9000/'
+  - script: kubectl cp -n $NAMESPACE ny-tlc-report-1.1.0.jar minio-mc-0:/tmp
+  - script: kubectl cp -n $NAMESPACE yellow_tripdata_2021-07.csv minio-mc-0:/tmp
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      mc cp /tmp/ny-tlc-report-1.1.0.jar test-minio/my-bucket
+  - script: >-
+      kubectl exec -n $NAMESPACE minio-mc-0 --
+      mc cp /tmp/yellow_tripdata_2021-07.csv test-minio/my-bucket
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+metadata:
+  name: spark-ny-deps-job
+timeout: 300
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: spark-ny-deps-job
+status:
+  succeeded: 1
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: spark-ny-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: spark-ny-deps-job
+spec:
+  template:
+    spec:
+      restartPolicy: Never
+      volumes:
+        - name: job-deps
+          persistentVolumeClaim:
+            claimName: spark-ny-pvc
+      containers:
+        - name: aws-deps
+          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0
+          env:
+            - name: DEST_DIR
+              value: "/dependencies/jars"
+            - name: AWS
+              value: "1.11.375"
+            - name: HADOOP
+              value: "3.2"
+          command:
+            [
+              "bash",
+              "-x",
+              "-o",
+              "pipefail",
+              "-c",
+              "mkdir -p ${DEST_DIR} && curl -L https://search.maven.org/remotecontent?filepath=org/apache/hadoop/hadoop-aws/${HADOOP}.0/hadoop-aws-${HADOOP}.0.jar -o ${DEST_DIR}/hadoop-aws-${HADOOP}.0.jar && curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS}/aws-java-sdk-bundle-${AWS}.jar -o ${DEST_DIR}/aws-java-sdk-bundle-${AWS}.jar && chown -R stackable:stackable ${DEST_DIR} && chmod -R a=,u=rwX ${DEST_DIR}",
+            ]
+          volumeMounts:
+            - name: job-deps
+              mountPath: /dependencies
+          securityContext:
+            runAsUser: 0
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: list-jars-job
+status:
+  succeeded: 1
