[Merged by Bors] - Add config maps #50


Closed
wants to merge 55 commits into from

Changes from 52 commits
55 commits
959c774
removed obsolete helm chart; added spark cluster role
Mar 29, 2022
24921ce
Added spark service account
Mar 29, 2022
60d6640
regenerated helm chart
Mar 29, 2022
979dba8
clippy fix
Mar 29, 2022
7329f44
clippy fix
Mar 29, 2022
3da2b05
fmt fix
Mar 29, 2022
8b9720d
clippy fix
Mar 29, 2022
cde716f
main merge
razvan Mar 30, 2022
d97d3cf
cargo fmt --all
razvan Mar 30, 2022
a64b8b9
Cluster Role
Mar 30, 2022
c176a80
added role permissions for PVCs
adwk67 Mar 30, 2022
0710e14
regenerate role yamls
adwk67 Mar 30, 2022
408f729
fixed indents
adwk67 Mar 30, 2022
c02ee47
fixed indents: manifests
adwk67 Mar 30, 2022
cc62802
Added recommended labels
Mar 30, 2022
7d37a0e
Merge branch 'rbac-setup' of github.com:stackabletech/spark-k8s-opera…
Mar 30, 2022
d2f56c9
Added a comment
Mar 30, 2022
00100f7
Changelog entry
Mar 30, 2022
b483452
reduce pvc permissions
adwk67 Mar 30, 2022
60e6002
initial missing standard docs
adwk67 Mar 30, 2022
9e4b602
added placeholders for pvc/rbac docs
adwk67 Mar 30, 2022
fd18d0d
added external storage comments
adwk67 Mar 31, 2022
a1b4ce8
Merge branch 'main' into documentation
adwk67 Mar 31, 2022
dd3cabb
added pvc example
adwk67 Mar 31, 2022
a754cc5
lint warning
adwk67 Mar 31, 2022
fc288b2
lint warning II
adwk67 Mar 31, 2022
26cd122
pv/pvc example
adwk67 Apr 1, 2022
c527606
initial RBAC page
adwk67 Apr 1, 2022
f1c3c0d
Merge branch 'main' into documentation
adwk67 Apr 1, 2022
e0f9307
rbac overview
adwk67 Apr 1, 2022
e3864c4
annotated some of the examples
adwk67 Apr 1, 2022
5b3971c
updated usage examples
adwk67 Apr 6, 2022
8206531
fixed yaml lint errors
adwk67 Apr 6, 2022
dd76aef
cleaned up examples/CRD defs
adwk67 Apr 6, 2022
4ae144a
initial impl of cmaps
adwk67 Apr 8, 2022
e2b25aa
Merge branch 'main' into add-config-maps
adwk67 Apr 8, 2022
11c1c91
wip: quote properties
adwk67 Apr 8, 2022
3a35501
fixed sorting
adwk67 Apr 11, 2022
17ceb14
regenerate charts
adwk67 Apr 11, 2022
876e726
resolved merge conflict
adwk67 Apr 11, 2022
b5deb99
resolved merge conflict II
adwk67 Apr 11, 2022
4528749
fixed test
adwk67 Apr 11, 2022
033c201
wip
adwk67 Apr 11, 2022
63ff6e7
working copy with arguments-via-config map
adwk67 Apr 12, 2022
f359ac2
regenerate charts
adwk67 Apr 12, 2022
e52d2bf
format corrections
adwk67 Apr 12, 2022
a498eba
move duplicated code to a function
adwk67 Apr 12, 2022
90b20fd
moved config map mounts to driver/executors
adwk67 Apr 12, 2022
8112601
removed redundant element
adwk67 Apr 12, 2022
11015b4
updated changelog and updated usage doc
adwk67 Apr 12, 2022
8b77c6a
lint fix
adwk67 Apr 12, 2022
f332efb
removed dead code
adwk67 Apr 12, 2022
919491f
add kuttl test
adwk67 Apr 19, 2022
8b36ff1
use VolumeMount rather than a new struct
adwk67 Apr 19, 2022
b48436f
use standard configmap-backed volume
adwk67 Apr 19, 2022
2 changes: 2 additions & 0 deletions CHANGELOG.md
Expand Up @@ -9,6 +9,8 @@ All notable changes to this project will be documented in this file.
- Initial commit
- ServiceAccount, ClusterRole and RoleBinding for Spark driver ([#39])
- S3 credentials can be provided via a Secret ([#42])
- Job information can be passed via a configuration map ([#50])

[#39]: https://github.com/stackabletech/spark-k8s-operator/pull/39
[#42]: https://github.com/stackabletech/spark-k8s-operator/pull/42
[#50]: https://github.com/stackabletech/spark-k8s-operator/pull/50
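A minimal sketch of what the new feature looks like from the user's side (the ConfigMap and file names here are illustrative, not prescribed by the operator):

```yaml
# Sketch: passing job arguments to a SparkApplication via a ConfigMap.
# The driver (and/or executor) section gains a configMapMounts list;
# each entry names an existing ConfigMap and a mount path.
driver:
  configMapMounts:
    - configMapName: cm-job-arguments  # ConfigMap holding e.g. job-args.txt
      path: /arguments                 # job reads /arguments/job-args.txt
```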
26 changes: 26 additions & 0 deletions deploy/crd/sparkapplication.crd.yaml
Expand Up @@ -85,6 +85,19 @@ spec:
driver:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
coreLimit:
nullable: true
type: string
Expand Down Expand Up @@ -209,6 +222,19 @@ spec:
executor:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
cores:
format: uint
minimum: 0.0
Expand Down
26 changes: 26 additions & 0 deletions deploy/helm/spark-k8s-operator/crds/crds.yaml
Expand Up @@ -87,6 +87,19 @@ spec:
driver:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
coreLimit:
nullable: true
type: string
Expand Down Expand Up @@ -211,6 +224,19 @@ spec:
executor:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
cores:
format: uint
minimum: 0.0
Expand Down
26 changes: 26 additions & 0 deletions deploy/manifests/crds.yaml
Expand Up @@ -88,6 +88,19 @@ spec:
driver:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
coreLimit:
nullable: true
type: string
Expand Down Expand Up @@ -212,6 +225,19 @@ spec:
executor:
nullable: true
properties:
configMapMounts:
items:
properties:
configMapName:
type: string
path:
type: string
required:
- configMapName
- path
type: object
nullable: true
type: array
cores:
format: uint
minimum: 0.0
Expand Down
8 changes: 8 additions & 0 deletions docs/modules/ROOT/examples/example-configmap.yaml
@@ -0,0 +1,8 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: cm-job-arguments # <1>
data:
job-args.txt: |
s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv # <2>
42 changes: 42 additions & 0 deletions docs/modules/ROOT/examples/example-sparkapp-configmap.yaml
@@ -0,0 +1,42 @@
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
name: ny-tlc-report-configmap
namespace: default
spec:
version: "1.0"
sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
mode: cluster
mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar # <3>
mainClass: tech.stackable.demo.spark.NYTLCReport
volumes:
- name: job-deps
persistentVolumeClaim:
claimName: pvc-ksv
args:
- "--input /arguments/job-args.txt" # <4>
sparkConf:
"spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
"spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
"spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
driver:
cores: 1
coreLimit: "1200m"
memory: "512m"
volumeMounts:
- name: job-deps
mountPath: /dependencies
configMapMounts:
- configMapName: cm-job-arguments # <5>
path: /arguments # <6>
executor:
cores: 1
instances: 3
memory: "512m"
volumeMounts:
- name: job-deps
mountPath: /dependencies
configMapMounts:
- configMapName: cm-job-arguments # <5>
path: /arguments # <6>
2 changes: 1 addition & 1 deletion docs/modules/ROOT/pages/rbac.adoc
Expand Up @@ -32,4 +32,4 @@ then the cluster role has to be created and assigned to the service account manually
[source,bash]
----
kubectl create clusterrolebinding spark-role --clusterrole=spark-driver-edit-role --serviceaccount=default:default
----
----
33 changes: 31 additions & 2 deletions docs/modules/ROOT/pages/usage.adoc
Expand Up @@ -92,15 +92,31 @@ include::example$example-sparkapp-pvc.yaml[]
include::example$example-sparkapp-s3-private.yaml[]
----

<1> Job python artifact (local)
<1> Job python artifact (located in S3)
<2> Artifact class
<3> S3 section, specifying the existing secret and S3 end-point ( in this case, Min-IO)
<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO)
<4> Credentials secret
<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources...
<6> ...in this case, in S3, accessed with the credentials defined in the secret
<7> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing
<8> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors

=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map

[source,yaml]
----
include::example$example-configmap.yaml[]
----
[source,yaml]
----
include::example$example-sparkapp-configmap.yaml[]
----
<1> Name of the configuration map
<2> Argument required by the job
<3> Job scala artifact that requires an input argument
<4> The expected job argument, accessed via the mounted configuration map file
<5> The name of the configuration map that will be mounted to the driver/executor
<6> The mount location of the configuration map (this will contain a file `/arguments/job-args.txt`)

== CRD argument coverage

Expand Down Expand Up @@ -187,6 +203,12 @@ Below are listed the CRD fields that can be defined by the user:
|`spec.driver.volumeMounts.mountPath`
|Volume mount path

|`spec.driver.configMapMounts.configMapName`
|Name of configuration map to be mounted in the driver

|`spec.driver.configMapMounts.path`
|Mount path of the configuration map in the driver

|`spec.executor.cores`
|Number of cores for each executor

Expand All @@ -204,4 +226,11 @@ Below are listed the CRD fields that can be defined by the user:

|`spec.executor.volumeMounts.mountPath`
|Volume mount path

|`spec.executor.configMapMounts.configMapName`
|Name of configuration map to be mounted in the executor

|`spec.executor.configMapMounts.path`
|Mount path of the configuration map in the executor
|===

50 changes: 50 additions & 0 deletions examples/ny-tlc-report-configmap.yaml
@@ -0,0 +1,50 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: cm-job-arguments
data:
job-args.txt: |
s3a://nyc-tlc/trip data/yellow_tripdata_2021-07.csv
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
name: ny-tlc-report-configmap
namespace: default
spec:
version: "1.0"
sparkImage: docker.stackable.tech/stackable/spark-k8s:3.2.1-hadoop3.2-stackable0.4.0
mode: cluster
mainApplicationFile: s3a://stackable-spark-k8s-jars/jobs/ny-tlc-report-1.1.0.jar
mainClass: tech.stackable.demo.spark.NYTLCReport
volumes:
- name: job-deps
persistentVolumeClaim:
claimName: pvc-ksv
args:
- "--input /arguments/job-args.txt"
sparkConf:
"spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
"spark.driver.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
"spark.executor.extraClassPath": "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"
driver:
cores: 1
coreLimit: "1200m"
memory: "512m"
volumeMounts:
- name: job-deps
mountPath: /dependencies
configMapMounts:
- configMapName: cm-job-arguments
path: /arguments
executor:
cores: 1
instances: 3
memory: "512m"
volumeMounts:
- name: job-deps
mountPath: /dependencies
configMapMounts:
- configMapName: cm-job-arguments
path: /arguments
31 changes: 31 additions & 0 deletions rust/crd/src/lib.rs
Expand Up @@ -169,6 +169,26 @@ impl SparkApplication {
tmp.iter().flat_map(|v| v.iter()).cloned().collect()
}

pub fn executor_config_map_mounts(&self) -> Vec<ConfigMapMount> {
let tmp = self
.spec
.executor
.as_ref()
.and_then(|executor_conf| executor_conf.config_map_mounts.clone());

tmp.iter().flat_map(|v| v.iter()).cloned().collect()
}

pub fn driver_config_map_mounts(&self) -> Vec<ConfigMapMount> {
let tmp = self
.spec
.driver
.as_ref()
.and_then(|driver_conf| driver_conf.config_map_mounts.clone());

tmp.iter().flat_map(|v| v.iter()).cloned().collect()
}

pub fn executor_volume_mounts(&self) -> Vec<VolumeMount> {
let tmp = self
.spec
Expand Down Expand Up @@ -287,6 +307,13 @@ pub struct CommonConfig {
pub enable_monitoring: Option<bool>,
}

#[derive(Clone, Debug, Default, Deserialize, JsonSchema, PartialEq, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ConfigMapMount {
pub config_map_name: String,
pub path: String,
}

#[derive(Clone, Debug, Default, Deserialize, JsonSchema, PartialEq, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DriverConfig {
Expand All @@ -295,6 +322,8 @@ pub struct DriverConfig {
pub memory: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub volume_mounts: Option<Vec<VolumeMount>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub config_map_mounts: Option<Vec<ConfigMapMount>>,
}

impl DriverConfig {
Expand Down Expand Up @@ -323,6 +352,8 @@ pub struct ExecutorConfig {
pub memory: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub volume_mounts: Option<Vec<VolumeMount>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub config_map_mounts: Option<Vec<ConfigMapMount>>,
}

impl ExecutorConfig {
Expand Down