
feat: add support for SparkConnect #539

Merged · 42 commits · Apr 10, 2025
Commits
32b79a1
wip
razvan Mar 21, 2025
c97e5d8
it builds
razvan Mar 21, 2025
6e6a828
kuttl test
razvan Mar 21, 2025
040a6f5
successfully create driver and executors
razvan Mar 21, 2025
5f5850a
use bundled spark-connect jar instead of --package
razvan Mar 24, 2025
6b57dec
kuttl test is successful
razvan Mar 25, 2025
47f32a9
Merge branch 'main' into feat/connect
razvan Mar 25, 2025
15fac04
simplify CRD: no roles and no role groups for spark connect servers
razvan Mar 26, 2025
67c8366
test passes again
razvan Mar 28, 2025
6bc0631
move all config to properties file
razvan Mar 28, 2025
62f33c6
use deployment instead of stateful set
razvan Mar 28, 2025
a810031
implement jvm overrides for the connect server
razvan Mar 28, 2025
eed00d1
implement log aggregation
razvan Mar 28, 2025
87655f9
implement pod overrides
razvan Mar 28, 2025
4f42a32
implement resource requests
razvan Mar 28, 2025
e559188
implement cluster operation
razvan Mar 31, 2025
4c9c06e
implement connect server status tracking
razvan Mar 31, 2025
a9152da
Refactor server related code in its own module.
razvan Mar 31, 2025
ca4ba59
cleanup
razvan Mar 31, 2025
98a8bc3
configure executors with pod templates
razvan Apr 1, 2025
01bfefc
split configuration between server and executor
razvan Apr 2, 2025
7d0483a
merge executor pod overrides into pod template
razvan Apr 2, 2025
cd44a77
implement executor affinity and resource properties
razvan Apr 3, 2025
bb4c852
bump op-rs to 0.88.0
razvan Apr 3, 2025
28dc0e8
implement user provided command line args for the connect server
razvan Apr 3, 2025
2499e95
Merge branch 'main' into feat/connect
razvan Apr 3, 2025
e6c5571
spark connect usage guide
razvan Apr 3, 2025
533318b
main merge
razvan Apr 4, 2025
c9ca081
update readme and fix typo
razvan Apr 4, 2025
581e2af
cleanup, liveliness probe, do not use the iceberg test for now
razvan Apr 4, 2025
e6cdc97
expose prometheus metrics
razvan Apr 7, 2025
2dec67d
Apply suggestions from code review
razvan Apr 10, 2025
4fd2a3d
remove duplicate constant
razvan Apr 10, 2025
f707cd2
rename argument
razvan Apr 10, 2025
a956730
GRPC and HTTP constants
razvan Apr 10, 2025
93bd80e
fix main merge problems
razvan Apr 10, 2025
433a2ea
remove unused error variant
razvan Apr 10, 2025
5d176aa
remove iceberg test script
razvan Apr 10, 2025
85c149c
Apply suggestions from code review
razvan Apr 10, 2025
7432f37
regenerate charts
razvan Apr 10, 2025
218c406
change visibility
maltesander Apr 10, 2025
5e46bc0
fix comment
razvan Apr 10, 2025
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added

- Experimental support for Spark Connect ([#539]).

### Changed

- BREAKING: Replace stackable-operator `initialize_logging` with stackable-telemetry `Tracing` ([#547], [#554]).
@@ -19,6 +23,7 @@ All notable changes to this project will be documented in this file.

- Use `json` file extension for log files ([#553]).

[#539]: https://github.com/stackabletech/spark-k8s-operator/pull/539
[#547]: https://github.com/stackabletech/spark-k8s-operator/pull/547
[#551]: https://github.com/stackabletech/spark-k8s-operator/pull/551
[#553]: https://github.com/stackabletech/spark-k8s-operator/pull/553
2 changes: 1 addition & 1 deletion crate-hashes.json

Some generated files are not rendered by default.

583 changes: 582 additions & 1 deletion deploy/helm/spark-k8s-operator/crds/crds.yaml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions deploy/helm/spark-k8s-operator/templates/roles.yaml
@@ -53,6 +53,7 @@ rules:
- apps
resources:
- statefulsets
- deployments
verbs:
- create
- delete
@@ -102,6 +103,7 @@ rules:
resources:
- sparkapplications
- sparkhistoryservers
- sparkconnectservers
verbs:
- get
- list
@@ -111,6 +113,7 @@
- spark.stackable.tech
resources:
- sparkapplications/status
- sparkconnectservers/status
verbs:
- patch
- apiGroups:
44 changes: 44 additions & 0 deletions docs/modules/spark-k8s/examples/example-spark-connect.yaml
@@ -0,0 +1,44 @@
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkConnectServer
metadata:
  name: spark-connect # <1>
spec:
  image:
    productVersion: "3.5.5" # <2>
    pullPolicy: IfNotPresent
  args:
    - "--package org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1" # <3>
  server:
    podOverrides:
      spec:
        containers:
          - name: spark
            env:
              - name: DEMO_GREETING # <4>
                value: "Hello"
    jvmArgumentOverrides:
      add:
        - -Dmy.custom.jvm.arg=customValue # <5>
    config:
      logging:
        enableVectorAgent: False
        containers:
          spark:
            custom:
              configMap: spark-connect-log-config # <6>
    configOverrides:
      spark-defaults.conf:
        spark.driver.cores: "3" # <7>
  executor:
    configOverrides:
      spark-defaults.conf:
        spark.executor.memoryOverhead: "1m" # <8>
        spark.executor.instances: "3"
    config:
      logging:
        enableVectorAgent: False
        containers:
          spark:
            custom:
              configMap: spark-connect-log-config
59 changes: 59 additions & 0 deletions docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
= Spark Connect
:description: Set up a Spark Connect server with Kubernetes as the distributed execution engine and an external service for clients
:page-aliases: spark_connect.adoc

WARNING: Support for Apache Spark Connect is considered experimental and is subject to change in future releases. Spark Connect is a young technology, and important questions remain open, mostly related to security and multi-tenancy.

Apache Spark Connect is a remote procedure call (RPC) server that allows clients to run Spark applications on a remote cluster. Clients can connect to the Spark Connect server using a variety of programming languages, editors and IDEs without needing to install Spark locally.

The Stackable Spark operator can set up Spark Connect servers backed by Kubernetes as a distributed execution engine.
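
Once a Spark Connect server is deployed (see the example below), clients connect to its gRPC endpoint. The following is a minimal PySpark sketch, not part of this PR: it assumes `pyspark[connect]` matching the server version is installed on the client and that the server's Service is reachable locally, for example via `kubectl port-forward`. The Service name `spark-connect-server` and port 15002 are assumptions, check the objects created in your cluster.

[source,python]
----
# Minimal Spark Connect client sketch.
# Assumed setup (not verified against this PR):
#   pip install "pyspark[connect]==3.5.5"
#   kubectl port-forward svc/spark-connect-server 15002:15002
from pyspark.sql import SparkSession

# Connect to the remote Spark Connect server instead of starting a local Spark.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = spark.range(10).selectExpr("id", "id * 2 AS doubled")
df.show()

spark.stop()
----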

== Deployment

The example below demonstrates how to set up a Spark Connect server and apply some customizations.

[source,yaml]
----
include::example$example-spark-connect.yaml[]
----

<1> The name of the Spark Connect server.
<2> Version of the Spark Connect server.
<3> Additional package to install when starting the Spark Connect server and executors.
<4> Environment variable to be created via `podOverrides`. Alternatively, the environment variable can be set in the `spec.server.envOverrides` section.
<5> Additional argument to be passed to the Spark Connect JVM settings. Do not use this to tweak heap settings. Use `spec.server.jvmOptions` instead.
<6> A custom log4j configuration file to be used by the Spark Connect server. The config map must have an entry called `log4j.properties`.
<7> Customize the driver properties in the `server` role. The number of cores here is not related to Kubernetes cores!
<8> Customize `spark.executor.\*` and `spark.kubernetes.executor.*` in the `executor` role.

== Metrics

The server pod exposes Prometheus metrics at the following endpoints:

* `/metrics/prometheus` for driver instances.
* `/metrics/executors/prometheus` for executor instances.
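
A quick way to verify these endpoints is sketched below. It assumes the driver's UI port has been forwarded locally with `kubectl port-forward`; the pod name and port 4040 are assumptions, check the objects in your cluster.

[source,python]
----
# Minimal sketch: fetch the Prometheus endpoints through a local port-forward,
# e.g. `kubectl port-forward <spark-connect-server-pod> 4040:4040` (assumed port).
import urllib.request

for path in ("/metrics/prometheus", "/metrics/executors/prometheus"):
    with urllib.request.urlopen(f"http://localhost:4040{path}") as response:
        print(path, response.status)  # expect HTTP 200 with Prometheus text output
----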

To customize the metrics configuration, use `spec.server.configOverrides` like this:

[source,yaml]
----
spec:
  server:
    configOverrides:
      metrics.properties:
        applications.sink.prometheusServlet.path: "/metrics/applications/prometheus"
----

The example above adds a new endpoint for application metrics.

== Notable Omissions

The following features are not yet supported by the Stackable Spark operator:

* Integration with the Spark History Server.
* Authorization and authentication. Currently, anyone with access to the Spark Connect service can run jobs.
* Volumes and volume mounts can be added only with pod overrides.
* Job dependencies must be provisioned as custom images or via `--packages` or `--jars` arguments.

== Known Issues

* Dynamically provisioning the Iceberg runtime leads to an `iceberg.SparkWrite$WriterFactory` `ClassNotFoundException` when clients attempt to use it.
1 change: 1 addition & 0 deletions docs/modules/spark-k8s/partials/nav.adoc
@@ -9,6 +9,7 @@
** xref:spark-k8s:usage-guide/security.adoc[]
** xref:spark-k8s:usage-guide/logging.adoc[]
** xref:spark-k8s:usage-guide/history-server.adoc[]
** xref:spark-k8s:usage-guide/spark-connect.adoc[]
** xref:spark-k8s:usage-guide/examples.adoc[]
** xref:spark-k8s:usage-guide/overrides.adoc[]
** xref:spark-k8s:usage-guide/operations/index.adoc[]
152 changes: 152 additions & 0 deletions rust/operator-binary/src/connect/common.rs
@@ -0,0 +1,152 @@
use std::collections::{BTreeMap, HashMap};

use product_config::writer::to_java_properties_string;
use snafu::{ResultExt, Snafu};
use stackable_operator::{
kvp::ObjectLabels,
role_utils::{JavaCommonConfig, JvmArgumentOverrides},
};
use strum::Display;

use super::crd::CONNECT_EXECUTOR_ROLE_NAME;
use crate::{
connect::crd::{
CONNECT_CONTROLLER_NAME, CONNECT_SERVER_ROLE_NAME, DUMMY_SPARK_CONNECT_GROUP_NAME,
},
crd::constants::{APP_NAME, OPERATOR_NAME},
};

#[derive(Snafu, Debug)]
#[allow(clippy::enum_variant_names)]
pub enum Error {
#[snafu(display("failed to merge jvm argument overrides"))]
MergeJvmArgumentOverrides {
source: stackable_operator::role_utils::Error,
},

#[snafu(display("failed to serialize spark properties"))]
SparkProperties {
source: product_config::writer::PropertiesWriterError,
},

#[snafu(display("failed to serialize jvm security properties",))]
JvmSecurityProperties {
source: product_config::writer::PropertiesWriterError,
},
}

pub(crate) fn labels<'a, T>(
scs: &'a T,
app_version_label: &'a str,
role: &'a str,
) -> ObjectLabels<'a, T> {
ObjectLabels {
owner: scs,
app_name: APP_NAME,
app_version: app_version_label,
operator_name: OPERATOR_NAME,
controller_name: CONNECT_CONTROLLER_NAME,
role,
role_group: DUMMY_SPARK_CONNECT_GROUP_NAME,
}
}

// The dead code annotation silences complaints about the missing Executor instantiations.
// These will come in the future.
#[allow(dead_code)]
#[derive(Clone, Debug, Display)]
#[strum(serialize_all = "lowercase")]
pub(crate) enum SparkConnectRole {
Server,
Executor,
}

pub(crate) fn object_name(stacklet_name: &str, role: SparkConnectRole) -> String {
match role {
SparkConnectRole::Server => format!("{}-{}", stacklet_name, CONNECT_SERVER_ROLE_NAME),
SparkConnectRole::Executor => format!("{}-{}", stacklet_name, CONNECT_EXECUTOR_ROLE_NAME),
}
}

// Returns the jvm arguments a user has provided merged with the operator props.
pub(crate) fn jvm_args(
jvm_args: &[String],
user_java_config: Option<&JavaCommonConfig>,
) -> Result<String, Error> {
if let Some(user_jvm_props) = user_java_config {
let operator_generated = JvmArgumentOverrides::new_with_only_additions(jvm_args.to_vec());
let mut user_jvm_props_copy = user_jvm_props.jvm_argument_overrides.clone();
user_jvm_props_copy
.try_merge(&operator_generated)
.context(MergeJvmArgumentOverridesSnafu)?;
Ok(user_jvm_props_copy
.effective_jvm_config_after_merging()
.join(" "))
} else {
Ok(jvm_args.join(" "))
}
}

// Merges server and executor properties and renders the contents
// of the Spark properties file.
pub(crate) fn spark_properties(
props: &[BTreeMap<String, Option<String>>; 2],
) -> Result<String, Error> {
let mut result = BTreeMap::new();
for p in props {
result.extend(p);
}
to_java_properties_string(result.into_iter()).context(SparkPropertiesSnafu)
}

pub(crate) fn security_properties(
config_overrides: Option<&HashMap<String, String>>,
) -> Result<String, Error> {
let mut result: BTreeMap<String, Option<String>> = [
(
"networkaddress.cache.ttl".to_string(),
Some("30".to_string()),
),
(
"networkaddress.cache.negative.ttl".to_string(),
Some("0".to_string()),
),
]
.into();

if let Some(user_config) = config_overrides {
result.extend(
user_config
.iter()
.map(|(k, v)| (k.clone(), Some(v.clone()))),
);
}

to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu)
}

pub(crate) fn metrics_properties(
config_overrides: Option<&HashMap<String, String>>,
) -> Result<String, Error> {
let mut result: BTreeMap<String, Option<String>> = [
(
"*.sink.prometheusServlet.class".to_string(),
Some("org.apache.spark.metrics.sink.PrometheusServlet".to_string()),
),
(
"*.sink.prometheusServlet.path".to_string(),
Some("/metrics/prometheus".to_string()),
),
]
.into();

if let Some(user_config) = config_overrides {
result.extend(
user_config
.iter()
.map(|(k, v)| (k.clone(), Some(v.clone()))),
);
}

to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu)
}