fix: Remove userClassPathFirst properties #355

Merged Feb 15, 2024 (19 commits; changes shown from 11 commits)
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,10 @@ All notable changes to this project will be documented in this file.
 - Various documentation of the CRD ([#319]).
 - [BREAKING] Removed version field. Several attributes have been changed to mandatory. While this change is
   technically breaking, existing Spark jobs would not have worked before as these attributes were necessary ([#319]).
+- [BREAKING] Remove `userClassPathFirst` properties from `spark-submit`. This was an experimental feature
+  introduced to support logging in XML format. A side effect of this removal is that the vector agent cannot
+  aggregate output from the `spark-submit` containers. On the other hand, it enables dynamic provisioning of
+  delta.io packages (among others) with Stackable stock images, which is much more important ([#355]).
 
 ### Fixed
 
@@ -22,6 +26,7 @@ All notable changes to this project will be documented in this file.
 [#313]: https://github.com/stackabletech/spark-k8s-operator/pull/313
 [#319]: https://github.com/stackabletech/spark-k8s-operator/pull/319
 [#344]: https://github.com/stackabletech/spark-k8s-operator/pull/344
+[#355]: https://github.com/stackabletech/spark-k8s-operator/pull/355
 
 ## [23.11.0] - 2023-11-24
 
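For context on the dynamic-provisioning point in the changelog entry above: with the `userClassPathFirst` properties gone, connector packages such as delta.io can be pulled in at submit time through Spark's regular dependency management. The fragment below is a hedged sketch of that usage, not taken from this PR; the Maven coordinates and version are illustrative assumptions that must match the Spark/Scala build in use, and required fields such as the Spark image are omitted for brevity.

---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: pyspark-delta
spec:
  mode: cluster
  mainApplicationFile: s3a://my-bucket/write-delta-table.py  # hypothetical script
  deps:
    packages:
      # Resolved by spark-submit at run time via --packages (assumed coordinates).
      - io.delta:delta-spark_2.12:3.0.0
  sparkConf:
    # Standard delta.io settings for Spark SQL integration.
    spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension
    spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog

The `pyspark-delta` kuttl test added below exercises this scenario end to end.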
8 changes: 8 additions & 0 deletions docs/modules/spark-k8s/pages/usage-guide/logging.adoc
@@ -0,0 +1,8 @@
= Logging

The Spark operator installs a https://vector.dev/docs/setup/deployment/roles/#agent[vector agent] as a side-car container in every application Pod except the `job` Pod that runs `spark-submit`. It also configures the logging framework to output logs in XML format. This is the same https://logging.apache.org/log4j/2.x/manual/layouts.html#XMLLayout[format] used across all Stackable products and it enables the https://vector.dev/docs/setup/deployment/roles/#aggregator[vector aggregator] to collect logs across the entire platform.

It is the user's responsibility to install and configure the vector aggregator, but the agents can discover the aggregator automatically using a discovery ConfigMap as described in the xref:concepts:logging.adoc[logging concepts].

NOTE: Only logs produced by the application's driver and executors are collected. Logs produced by `spark-submit` are discarded.
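To connect the agents to an aggregator, its address is published in a discovery ConfigMap that the SparkApplication references. A minimal sketch follows; the ConfigMap shape matches the `vector-aggregator-discovery` test template added in this PR, while the `vectorAggregatorConfigMapName` field name and the example address are assumptions.

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector-aggregator-discovery
data:
  ADDRESS: vector-aggregator:6000  # assumed aggregator endpoint
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: spark-pi
spec:
  # Assumed field name; points the vector agents at the ConfigMap above.
  vectorAggregatorConfigMapName: vector-aggregator-discovery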

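Logging for the driver and executor roles stays configurable even though `spark-submit` logging support is removed. A hedged sketch of a per-role override follows; the exact field paths (`driver.logging`, `enableVectorAgent`, the `spark` container key) are assumptions based on the shared Stackable logging framework and are not verified against this operator's schema.

---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: spark-pi
spec:
  driver:
    logging:
      enableVectorAgent: true  # assumed field from the shared logging framework
      containers:
        spark:  # assumed container key
          loggers:
            ROOT:
              level: INFO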
41 changes: 22 additions & 19 deletions rust/crd/src/lib.rs
@@ -253,7 +253,7 @@ impl SparkApplication {
         &self,
         s3conn: &Option<S3ConnectionSpec>,
         s3logdir: &Option<S3LogDir>,
-        log_config_map: &str,
+        log_config_map: Option<&str>,
     ) -> Result<Vec<Volume>, Error> {
         let mut result: Vec<Volume> = self.spec.volumes.clone();
 
@@ -294,11 +294,13 @@ impl SparkApplication {
             }
         }
 
-        result.push(
-            VolumeBuilder::new(VOLUME_MOUNT_NAME_LOG_CONFIG)
-                .with_config_map(log_config_map)
-                .build(),
-        );
+        if let Some(log_config_map) = log_config_map {
+            result.push(
+                VolumeBuilder::new(VOLUME_MOUNT_NAME_LOG_CONFIG)
+                    .with_config_map(log_config_map)
+                    .build(),
+            );
+        }
 
         result.push(
             VolumeBuilder::new(VOLUME_MOUNT_NAME_LOG)
@@ -358,14 +360,15 @@ impl SparkApplication {
                 ..VolumeMount::default()
             },
         ];
-        self.add_common_volume_mounts(volume_mounts, s3conn, s3logdir)
+        self.add_common_volume_mounts(volume_mounts, s3conn, s3logdir, false)
     }
 
     fn add_common_volume_mounts(
         &self,
         mut mounts: Vec<VolumeMount>,
         s3conn: &Option<S3ConnectionSpec>,
         s3logdir: &Option<S3LogDir>,
+        logging_enabled: bool,
     ) -> Vec<VolumeMount> {
         if self.spec.image.is_some() {
             mounts.push(VolumeMount {
@@ -401,17 +404,19 @@ impl SparkApplication {
             mounts.push(vm);
         }
 
-        mounts.push(VolumeMount {
-            name: VOLUME_MOUNT_NAME_LOG_CONFIG.into(),
-            mount_path: VOLUME_MOUNT_PATH_LOG_CONFIG.into(),
-            ..VolumeMount::default()
-        });
+        if logging_enabled {
+            mounts.push(VolumeMount {
+                name: VOLUME_MOUNT_NAME_LOG_CONFIG.into(),
+                mount_path: VOLUME_MOUNT_PATH_LOG_CONFIG.into(),
+                ..VolumeMount::default()
+            });
 
-        mounts.push(VolumeMount {
-            name: VOLUME_MOUNT_NAME_LOG.into(),
-            mount_path: VOLUME_MOUNT_PATH_LOG.into(),
-            ..VolumeMount::default()
-        });
+            mounts.push(VolumeMount {
+                name: VOLUME_MOUNT_NAME_LOG.into(),
+                mount_path: VOLUME_MOUNT_PATH_LOG.into(),
+                ..VolumeMount::default()
+            });
+        }
 
         if !self.packages().is_empty() {
             mounts.push(VolumeMount {
@@ -484,10 +489,8 @@ impl SparkApplication {
             format!("--conf spark.kubernetes.authenticate.driver.serviceAccountName={}", serviceaccount_name),
             format!("--conf spark.driver.defaultJavaOptions=-Dlog4j.configurationFile={VOLUME_MOUNT_PATH_LOG_CONFIG}/{LOG4J2_CONFIG_FILE}"),
             format!("--conf spark.driver.extraClassPath=/stackable/spark/extra-jars/*"),
-            "--conf spark.driver.userClassPathFirst=true".to_string(),
             format!("--conf spark.executor.defaultJavaOptions=-Dlog4j.configurationFile={VOLUME_MOUNT_PATH_LOG_CONFIG}/{LOG4J2_CONFIG_FILE}"),
             format!("--conf spark.executor.extraClassPath=/stackable/spark/extra-jars/*"),
-            "--conf spark.executor.userClassPathFirst=true".to_string(),
         ]);
 
         // See https://spark.apache.org/docs/latest/running-on-kubernetes.html#dependency-management
5 changes: 1 addition & 4 deletions rust/crd/src/roles.rs
@@ -147,7 +147,7 @@ impl RoleConfig {
         s3logdir: &Option<S3LogDir>,
     ) -> Vec<VolumeMount> {
         let volume_mounts = self.volume_mounts.clone().unwrap_or_default().into();
-        spark_application.add_common_volume_mounts(volume_mounts, s3conn, s3logdir)
+        spark_application.add_common_volume_mounts(volume_mounts, s3conn, s3logdir, true)
     }
 }
 
@@ -200,8 +200,6 @@ impl Configuration for RoleConfigFragment {
 pub struct SubmitConfig {
     #[fragment_attrs(serde(default))]
     pub resources: Resources<SparkStorageConfig, NoRuntimeLimits>,
-    #[fragment_attrs(serde(default))]
-    pub logging: Logging<SparkContainer>,
 }
 
 impl SubmitConfig {
@@ -218,7 +216,6 @@ impl SubmitConfig {
                 },
                 storage: SparkStorageConfigFragment {},
             },
-            logging: product_logging::spec::default_logging(),
         }
     }
 }
64 changes: 5 additions & 59 deletions rust/operator-binary/src/spark_k8s_controller.rs
@@ -318,8 +318,6 @@ pub async fn reconcile(spark_application: Arc<SparkApplication>, ctx: Arc<Ctx>)
     let submit_job_config_map = submit_job_config_map(
         &spark_application,
         submit_product_config,
-        vector_aggregator_address.as_deref(),
-        &submit_config.logging,
         &resolved_product_image,
     )?;
     client
@@ -580,7 +578,7 @@ fn pod_template_config_map(
     };
 
     let mut volumes = spark_application
-        .volumes(s3conn, s3logdir, &log_config_map)
+        .volumes(s3conn, s3logdir, Some(&log_config_map))
         .context(CreateVolumesSnafu)?;
     volumes.push(
         VolumeBuilder::new(VOLUME_MOUNT_NAME_CONFIG)
@@ -657,8 +655,6 @@ fn pod_template_config_map(
 fn submit_job_config_map(
     spark_application: &SparkApplication,
     product_config: Option<&HashMap<PropertyNameKind, BTreeMap<String, String>>>,
-    vector_aggregator_address: Option<&str>,
-    logging: &Logging<SparkContainer>,
     spark_image: &ResolvedProductImage,
 ) -> Result<ConfigMap> {
     let cm_name = spark_application.submit_job_config_map_name();
@@ -679,20 +675,6 @@
             .build(),
     );
 
-    product_logging::extend_config_map(
-        &RoleGroupRef {
-            cluster: ObjectRef::from_obj(spark_application),
-            role: String::new(),
-            role_group: String::new(),
-        },
-        vector_aggregator_address,
-        logging,
-        SparkContainer::SparkSubmit,
-        SparkContainer::Vector,
-        &mut cm_builder,
-    )
-    .context(InvalidLoggingConfigSnafu { cm_name })?;
-
     if let Some(product_config) = product_config {
         let jvm_sec_props: BTreeMap<String, Option<String>> = product_config
             .get(&PropertyNameKind::File(
@@ -731,27 +713,7 @@ fn spark_job(
     let mut cb = ContainerBuilder::new(&SparkContainer::SparkSubmit.to_string())
         .context(IllegalContainerNameSnafu)?;
 
-    let log_config_map = if let Some(ContainerLogConfig {
-        choice:
-            Some(ContainerLogConfigChoice::Custom(CustomContainerLogConfig {
-                custom: ConfigMapLogConfig { config_map },
-            })),
-    }) = job_config
-        .logging
-        .containers
-        .get(&SparkContainer::SparkSubmit)
-    {
-        config_map.into()
-    } else {
-        spark_application.submit_job_config_map_name()
-    };
-
-    let mut args = vec![job_commands.join(" ")];
-    if job_config.logging.enable_vector_agent {
-        // Wait for Vector to gather the logs.
-        args.push("sleep 10".into());
-        args.push(create_vector_shutdown_file_command(VOLUME_MOUNT_PATH_LOG));
-    }
+    let args = vec![job_commands.join(" ")];
 
     cb.image_from_product_image(spark_image)
         .command(vec!["/bin/bash".to_string(), "-c".to_string()])
@@ -762,8 +724,7 @@
         .add_env_var(
             "SPARK_SUBMIT_OPTS",
             format!(
-                "-cp /stackable/spark/extra-jars/*:/stackable/spark/jars/* \
-                -Dlog4j.configurationFile={VOLUME_MOUNT_PATH_LOG_CONFIG}/{LOG4J2_CONFIG_FILE}"
+                "-Dlog4j.configurationFile={VOLUME_MOUNT_PATH_LOG_CONFIG}/{LOG4J2_CONFIG_FILE}"
             ),
         )
         // TODO: move this to the image
@@ -786,26 +747,11 @@
     ];
     volumes.extend(
         spark_application
-            .volumes(s3conn, s3logdir, &log_config_map)
+            .volumes(s3conn, s3logdir, None)
             .context(CreateVolumesSnafu)?,
     );
 
-    let mut containers = vec![cb.build()];
-
-    if job_config.logging.enable_vector_agent {
-        containers.push(vector_container(
-            spark_image,
-            VOLUME_MOUNT_NAME_CONFIG,
-            VOLUME_MOUNT_NAME_LOG,
-            job_config.logging.containers.get(&SparkContainer::Vector),
-            ResourceRequirementsBuilder::new()
-                .with_cpu_request("250m")
-                .with_cpu_limit("500m")
-                .with_memory_request("128Mi")
-                .with_memory_limit("128Mi")
-                .build(),
-        ));
-    }
+    let containers = vec![cb.build()];
 
     let mut pod =
         PodTemplateSpec {
10 changes: 10 additions & 0 deletions tests/templates/kuttl/delta/01-assert.yaml.j2
@@ -0,0 +1,10 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector-aggregator-discovery
{% endif %}
@@ -0,0 +1,9 @@
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector-aggregator-discovery
data:
  ADDRESS: {{ lookup('env', 'VECTOR_AGGREGATOR') }}
{% endif %}
25 changes: 25 additions & 0 deletions tests/templates/kuttl/delta/10-s3-secret.yaml
@@ -0,0 +1,25 @@
---
apiVersion: v1
kind: Secret
metadata:
  name: minio-credentials
  labels:
    secrets.stackable.tech/class: s3-credentials-class
timeout: 240
stringData:
  accessKey: minioAccessKey
  secretKey: minioSecretKey
  # The following two entries are used by the Bitnami chart for MinIO to
  # set up credentials for accessing buckets managed by the MinIO tenant.
  root-user: minioAccessKey
  root-password: minioSecretKey
---
apiVersion: secrets.stackable.tech/v1alpha1
kind: SecretClass
metadata:
  name: s3-credentials-class
spec:
  backend:
    k8sSearch:
      searchNamespace:
        pod: {}
20 changes: 20 additions & 0 deletions tests/templates/kuttl/delta/20-assert.yaml
@@ -0,0 +1,20 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 900
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-minio
status:
  readyReplicas: 1
---
apiVersion: v1
kind: Pod
metadata:
  name: minio-client
  labels:
    app: minio-client
status:
  phase: Running
48 changes: 48 additions & 0 deletions tests/templates/kuttl/delta/20-setup-minio.yaml
@@ -0,0 +1,48 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
  - script: >-
      helm install test-minio
      --namespace $NAMESPACE
      --version 11.9.2
      -f helm-bitnami-minio-values.yaml
      --repo https://charts.bitnami.com/bitnami minio
timeout: 240
---
apiVersion: v1
kind: Pod
metadata:
  name: minio-client
  labels:
    app: minio-client
spec:
  selector:
    matchLabels:
      app: minio-client
  restartPolicy: Never
  containers:
    - name: minio-client
      image: docker.io/bitnami/minio-client:2022.8.11-debian-11-r3
      command: ["bash", "-c", "sleep infinity"]
      stdin: true
      tty: true
      env:
        - name: MINIO_SERVER_ACCESS_KEY
          valueFrom:
            secretKeyRef:
              name: minio-credentials
              key: root-user
              optional: false
        - name: MINIO_SERVER_SECRET_KEY
          valueFrom:
            secretKeyRef:
              name: minio-credentials
              key: root-password
              optional: false
        - name: MINIO_SERVER_HOST
          value: test-minio
        - name: MINIO_SERVER_PORT_NUMBER
          value: "9000"
        - name: MINIO_SERVER_SCHEME
          value: http
9 changes: 9 additions & 0 deletions tests/templates/kuttl/delta/30-prepare-bucket.yaml
@@ -0,0 +1,9 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
  # give minio enough time to start
  - command: sleep 10
  - command: kubectl exec -n $NAMESPACE minio-client -- sh -c 'mc alias set test-minio http://test-minio:9000 $$MINIO_SERVER_ACCESS_KEY $$MINIO_SERVER_SECRET_KEY'
  - command: kubectl exec -n $NAMESPACE minio-client -- mc mb test-minio/my-bucket
  - command: kubectl exec -n $NAMESPACE minio-client -- mc policy set public test-minio/my-bucket
12 changes: 12 additions & 0 deletions tests/templates/kuttl/delta/40-assert.yaml
@@ -0,0 +1,12 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
---
# The Job starting the whole process
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: pyspark-delta
status:
  phase: Succeeded