Skip to content

Commit b166061

Browse files
razvannightkr
and authored
feat: run containerdebug in the background (#508)
* feat: run containerdebug in the background * various fixes * reorganize the smoke test files * update changelog * fix containerdebug log dir * successfully ran tests * reintroduce the CONTAINERDEBUG_LOG_DIRECTORY env var * less verbose test output * ensure vol mount exists and remove env duplicates * Update rust/crd/src/lib.rs Co-authored-by: Natalie Klestrup Röijezon <[email protected]> --------- Co-authored-by: Natalie Klestrup Röijezon <[email protected]>
1 parent ba48ce6 commit b166061

19 files changed

+151
-58
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ All notable changes to this project will be documented in this file.
88

99
- The lifetime of auto generated TLS certificates is now configurable with the role and roleGroup
1010
config property `requestedSecretLifetime`. This helps reduce frequent Pod restarts ([#501]).
11+
- Run a `containerdebug` process in the background of each Spark container to collect debugging information ([#508]).
1112

1213
[#501]: https://github.com/stackabletech/spark-k8s-operator/pull/501
14+
[#508]: https://github.com/stackabletech/spark-k8s-operator/pull/508
1315

1416
## [24.11.0] - 2024-11-18
1517

rust/crd/src/history.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,16 @@ impl SparkHistoryServer {
242242
let mut vars: BTreeMap<String, EnvVar> = BTreeMap::new();
243243
let role_env_overrides = &self.role().config.env_overrides;
244244

245+
// Needed by the `containerdebug` running in the background of the history container
246+
// to log its tracing information to.
247+
vars.insert(
248+
"CONTAINERDEBUG_LOG_DIRECTORY".to_string(),
249+
EnvVar {
250+
name: "CONTAINERDEBUG_LOG_DIRECTORY".to_string(),
251+
value: Some(format!("{VOLUME_MOUNT_PATH_LOG}/containerdebug")),
252+
value_from: None,
253+
},
254+
);
245255
// This env var prevents the history server from detaching itself from the
246256
// start script because this leads to the Pod terminating immediately.
247257
vars.insert(

rust/crd/src/lib.rs

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -326,18 +326,18 @@ impl SparkApplication {
326326
.with_config_map(log_config_map)
327327
.build(),
328328
);
329-
330-
result.push(
331-
VolumeBuilder::new(VOLUME_MOUNT_NAME_LOG)
332-
.with_empty_dir(
333-
None::<String>,
334-
Some(product_logging::framework::calculate_log_volume_size_limit(
335-
&[MAX_SPARK_LOG_FILES_SIZE, MAX_INIT_LOG_FILES_SIZE],
336-
)),
337-
)
338-
.build(),
339-
);
340329
}
330+
// This volume is also used by the containerdebug process so it must always be there.
331+
result.push(
332+
VolumeBuilder::new(VOLUME_MOUNT_NAME_LOG)
333+
.with_empty_dir(
334+
None::<String>,
335+
Some(product_logging::framework::calculate_log_volume_size_limit(
336+
&[MAX_SPARK_LOG_FILES_SIZE, MAX_INIT_LOG_FILES_SIZE],
337+
)),
338+
)
339+
.build(),
340+
);
341341

342342
if !self.packages().is_empty() {
343343
result.push(
@@ -466,14 +466,16 @@ impl SparkApplication {
466466
mount_path: VOLUME_MOUNT_PATH_LOG_CONFIG.into(),
467467
..VolumeMount::default()
468468
});
469-
470-
mounts.push(VolumeMount {
471-
name: VOLUME_MOUNT_NAME_LOG.into(),
472-
mount_path: VOLUME_MOUNT_PATH_LOG.into(),
473-
..VolumeMount::default()
474-
});
475469
}
476470

471+
// This is used at least by the containerdebug process.
472+
// The volume is always there.
473+
mounts.push(VolumeMount {
474+
name: VOLUME_MOUNT_NAME_LOG.into(),
475+
mount_path: VOLUME_MOUNT_PATH_LOG.into(),
476+
..VolumeMount::default()
477+
});
478+
477479
if !self.packages().is_empty() {
478480
mounts.push(VolumeMount {
479481
name: VOLUME_MOUNT_NAME_IVY2.into(),
@@ -527,9 +529,7 @@ impl SparkApplication {
527529
let mode = &self.spec.mode;
528530
let name = self.metadata.name.clone().context(ObjectHasNoNameSnafu)?;
529531

530-
let mut submit_cmd: Vec<String> = vec![];
531-
532-
submit_cmd.extend(vec![
532+
let mut submit_cmd = vec![
533533
"/stackable/spark/bin/spark-submit".to_string(),
534534
"--verbose".to_string(),
535535
"--master k8s://https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT_HTTPS}".to_string(),
@@ -547,7 +547,7 @@ impl SparkApplication {
547547
format!("--conf spark.driver.extraClassPath=/stackable/spark/extra-jars/*"),
548548
format!("--conf spark.executor.defaultJavaOptions=-Dlog4j.configurationFile={VOLUME_MOUNT_PATH_LOG_CONFIG}/{LOG4J2_CONFIG_FILE}"),
549549
format!("--conf spark.executor.extraClassPath=/stackable/spark/extra-jars/*"),
550-
]);
550+
];
551551

552552
// See https://spark.apache.org/docs/latest/running-on-kubernetes.html#dependency-management
553553
// for possible S3 related properties
@@ -676,7 +676,10 @@ impl SparkApplication {
676676

677677
submit_cmd.extend(self.spec.args.clone());
678678

679-
Ok(submit_cmd)
679+
Ok(vec![
680+
format!("containerdebug --output={VOLUME_MOUNT_PATH_LOG}/containerdebug-state.json --loop &"),
681+
submit_cmd.join(" "),
682+
])
680683
}
681684

682685
pub fn env(
@@ -685,6 +688,27 @@ impl SparkApplication {
685688
logdir: &Option<ResolvedLogDir>,
686689
) -> Vec<EnvVar> {
687690
let mut e: Vec<EnvVar> = self.spec.env.clone();
691+
692+
// These env variables enable the `containerdebug` process in driver and executor pods.
693+
// More precisely, this process runs in the background of every `spark` container.
694+
// - `CONTAINERDEBUG_LOG_DIRECTORY` - is the location where tracing information from the process
695+
// is written. This directory is created by the process itself.
696+
// - `_STACKABLE_PRE_HOOK` - is evaluated by the entrypoint script (run-spark.sh) in the Spark images
697+
// before the actual JVM process is started. The result of this evaluation is that the
698+
// `containerdebug` process is executed in the background.
699+
e.extend(vec![
700+
EnvVar {
701+
name: "CONTAINERDEBUG_LOG_DIRECTORY".into(),
702+
value: Some(format!("{VOLUME_MOUNT_PATH_LOG}/containerdebug")),
703+
value_from: None,
704+
},
705+
EnvVar {
706+
name: "_STACKABLE_PRE_HOOK".into(),
707+
value: Some(format!( "containerdebug --output={VOLUME_MOUNT_PATH_LOG}/containerdebug-state.json --loop &")),
708+
value_from: None,
709+
},
710+
]);
711+
688712
if self.requirements().is_some() {
689713
e.push(EnvVar {
690714
name: "PYTHONPATH".to_string(),
@@ -1385,6 +1409,12 @@ mod tests {
13851409
name: "executor-pod-template".into(),
13861410
..VolumeMount::default()
13871411
},
1412+
VolumeMount {
1413+
mount_path: "/stackable/log".into(),
1414+
mount_propagation: None,
1415+
name: "log".into(),
1416+
..VolumeMount::default()
1417+
},
13881418
VolumeMount {
13891419
mount_path: "/kerberos".into(),
13901420
mount_propagation: None,

rust/operator-binary/src/history/history_controller.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,13 @@ fn build_stateful_set(
527527
.context(InvalidContainerNameSnafu)?
528528
.image_from_product_image(resolved_product_image)
529529
.resources(merged_config.resources.clone().into())
530-
.command(vec!["/bin/bash".to_string()])
530+
.command(vec![
531+
"/bin/bash".to_string(),
532+
"-x".to_string(),
533+
"-euo".to_string(),
534+
"pipefail".to_string(),
535+
"-c".to_string(),
536+
])
531537
.args(command_args(log_dir))
532538
.add_container_port("http", 18080)
533539
.add_container_port("metrics", METRICS_PORT.into())
@@ -751,10 +757,10 @@ fn command_args(logdir: &ResolvedLogDir) -> Vec<String> {
751757
}
752758

753759
command.extend(vec![
760+
format!("containerdebug --output={VOLUME_MOUNT_PATH_LOG}/containerdebug-state.json --loop &"),
754761
format!("/stackable/spark/sbin/start-history-server.sh --properties-file {VOLUME_MOUNT_PATH_CONFIG}/{SPARK_DEFAULTS_FILE_NAME}"),
755762
]);
756-
757-
vec![String::from("-c"), command.join(" && ")]
763+
vec![command.join("\n")]
758764
}
759765

760766
fn labels<'a, T>(

rust/operator-binary/src/spark_k8s_controller.rs

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -466,8 +466,14 @@ fn init_containers(
466466

467467
Some(
468468
jcb.image(job_image)
469-
.command(vec!["/bin/bash".to_string(), "-c".to_string()])
470-
.args(vec![args.join(" && ")])
469+
.command(vec![
470+
"/bin/bash".to_string(),
471+
"-x".to_string(),
472+
"-euo".to_string(),
473+
"pipefail".to_string(),
474+
"-c".to_string(),
475+
])
476+
.args(vec![args.join("\n")])
471477
.add_volume_mount(VOLUME_MOUNT_NAME_JOB, VOLUME_MOUNT_PATH_JOB)
472478
.context(AddVolumeMountSnafu)?
473479
.add_volume_mount(VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_PATH_LOG)
@@ -509,8 +515,14 @@ fn init_containers(
509515
));
510516

511517
rcb.image(&spark_image.image)
512-
.command(vec!["/bin/bash".to_string(), "-c".to_string()])
513-
.args(vec![args.join(" && ")])
518+
.command(vec![
519+
"/bin/bash".to_string(),
520+
"-x".to_string(),
521+
"-euo".to_string(),
522+
"pipefail".to_string(),
523+
"-c".to_string(),
524+
])
525+
.args(vec![args.join("\n")])
514526
.add_volume_mount(VOLUME_MOUNT_NAME_REQ, VOLUME_MOUNT_PATH_REQ)
515527
.context(AddVolumeMountSnafu)?
516528
.add_volume_mount(VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_PATH_LOG)
@@ -549,8 +561,14 @@ fn init_containers(
549561
}
550562
Some(
551563
tcb.image(&spark_image.image)
552-
.command(vec!["/bin/bash".to_string(), "-c".to_string()])
553-
.args(vec![args.join(" && ")])
564+
.command(vec![
565+
"/bin/bash".to_string(),
566+
"-x".to_string(),
567+
"-euo".to_string(),
568+
"pipefail".to_string(),
569+
"-c".to_string(),
570+
])
571+
.args(vec![args.join("\n")])
554572
.add_volume_mount(STACKABLE_TRUST_STORE_NAME, STACKABLE_TRUST_STORE)
555573
.context(AddVolumeMountSnafu)?
556574
.resources(
@@ -858,12 +876,17 @@ fn spark_job(
858876
let mut cb = ContainerBuilder::new(&SparkContainer::SparkSubmit.to_string())
859877
.context(IllegalContainerNameSnafu)?;
860878

861-
let args = [job_commands.join(" ")];
862879
let merged_env = spark_application.merged_env(SparkApplicationRole::Submit, env);
863880

864881
cb.image_from_product_image(spark_image)
865-
.command(vec!["/bin/bash".to_string(), "-c".to_string()])
866-
.args(vec![args.join(" && ")])
882+
.command(vec![
883+
"/bin/bash".to_string(),
884+
"-x".to_string(),
885+
"-euo".to_string(),
886+
"pipefail".to_string(),
887+
"-c".to_string(),
888+
])
889+
.args(vec![job_commands.join("\n")])
867890
.resources(job_config.resources.clone().into())
868891
.add_volume_mounts(spark_application.spark_job_volume_mounts(s3conn, logdir))
869892
.context(AddVolumeMountSnafu)?

tests/templates/kuttl/smoke/03-assert.yaml

Lines changed: 0 additions & 18 deletions
This file was deleted.
Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
---
22
apiVersion: kuttl.dev/v1beta1
33
kind: TestAssert
4+
timeout: 900
5+
---
6+
apiVersion: apps/v1
7+
kind: Deployment
48
metadata:
5-
name: history-api-check
6-
timeout: 180
9+
name: test-minio
10+
status:
11+
readyReplicas: 1
712
---
8-
apiVersion: batch/v1
9-
kind: Job
13+
apiVersion: apps/v1
14+
kind: Deployment
1015
metadata:
11-
name: history-api-check
16+
name: eventlog-minio
1217
status:
13-
succeeded: 1
18+
readyReplicas: 1
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# This test checks if the containerdebug-state.json file is present and valid
3+
apiVersion: kuttl.dev/v1beta1
4+
kind: TestAssert
5+
timeout: 60
6+
commands:
7+
- script: kubectl exec -n $NAMESPACE --container spark-history spark-history-node-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status '"valid JSON"'
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
# This test checks if the containerdebug-state.json file is present and valid.
3+
#
4+
# It needs to run as soon as the spark application has been submitted because
5+
# once it is completed the pods are terminated.
6+
#
7+
# Unfortunately it's impossible to test the driver and the executor pods in a
8+
# reliable way.
9+
#
10+
apiVersion: kuttl.dev/v1beta1
11+
kind: TestAssert
12+
commands:
13+
- script: |
14+
SPARK_SUBMIT_POD=$(kubectl get -n $NAMESPACE pods --field-selector=status.phase=Running --selector batch.kubernetes.io/job-name=spark-pi-s3-1 -o jsonpath='{.items[0].metadata.name}')
15+
kubectl exec -n $NAMESPACE --container spark-submit $SPARK_SUBMIT_POD -- cat /stackable/log/containerdebug-state.json | jq --exit-status '"valid JSON"'
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
apiVersion: kuttl.dev/v1beta1
3+
kind: TestAssert
4+
metadata:
5+
name: history-api-check
6+
timeout: 180
7+
---
8+
apiVersion: batch/v1
9+
kind: Job
10+
metadata:
11+
name: history-api-check
12+
status:
13+
succeeded: 1

0 commit comments

Comments
 (0)