From e85be4c9d64dabb471d750fe7641d6e3984f85f2 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Tue, 21 May 2024 18:01:08 +0200 Subject: [PATCH 1/5] fix: allow fine-granular resource CPU settings --- .../pages/usage-guide/resources.adoc | 12 ++++++-- rust/crd/src/lib.rs | 29 ++++++++++--------- .../kuttl/resources/10-assert.yaml.j2 | 6 ++-- .../resources/10-deploy-spark-app.yaml.j2 | 2 +- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/docs/modules/spark-k8s/pages/usage-guide/resources.adoc b/docs/modules/spark-k8s/pages/usage-guide/resources.adoc index 9224a2a2..a2daecb7 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/resources.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/resources.adoc @@ -58,26 +58,32 @@ To illustrate resource configuration consider the use-case where resources are d === CPU -CPU request and limit will be rounded up to the next integer value, resulting in the following: +CPU request and limit will be used as defined in the custom resource resulting in the following: |=== -|CRD |Spark conf +|CRD |spark.kubernetes.{driver/executor} cores|spark.{driver/executor} cores (rounded up) +|1800m |1800m |2 +|100m |100m |1 +|1.5 |1.5 |2 +|2 |2 |2 |=== -Spark allows CPU limits to be set for the driver and executor using Spark settings (`spark.{driver|executor}.cores}`) as well as Kubernetes-specific ones (`spark.kubernetes.{driver,executor}.{request|limit}.cores`). `spark.kubernetes.executor.request.cores` takes precedence over `spark.executor.cores` in determining the pod CPU request, but does not affect task parallelism (the number of tasks an executor can run concurrently), so for this reason `spark.executor.cores` is set to the value of `spark.kubernetes.executor.limit.cores`. +`spark.kubernetes.{driver,executor}.{request|limit}.cores` determine the actual pod CPU request and are taken directly from the manifest as defined by the user. +`spark.{driver|executor}.cores}` are set to the rounded(-up) value of the manifest settings. +Task parallelism (the number of tasks an executor can run concurrently), is determined by `spark.executor.cores`. === Memory diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 0b07b1fd..b526726e 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -915,15 +915,17 @@ fn resources_to_driver_props( .. } = &driver_config.resources { - let min_cores = cores_from_quantity(min.0.clone())?; - let max_cores = cores_from_quantity(max.0.clone())?; - // will have default value from resources to apply if nothing set specifically - props.insert("spark.driver.cores".to_string(), max_cores.clone()); + let driver_cores = cores_from_quantity(max.0.clone())?; + // take rounded value for driver.cores but actual values for the pod + props.insert("spark.driver.cores".to_string(), driver_cores.clone()); props.insert( "spark.kubernetes.driver.request.cores".to_string(), - min_cores, + min.0.clone(), + ); + props.insert( + "spark.kubernetes.driver.limit.cores".to_string(), + max.0.clone(), ); - props.insert("spark.kubernetes.driver.limit.cores".to_string(), max_cores); } if let Resources { @@ -955,17 +957,16 @@ fn resources_to_executor_props( .. } = &executor_config.resources { - let min_cores = cores_from_quantity(min.0.clone())?; - let max_cores = cores_from_quantity(max.0.clone())?; - // will have default value from resources to apply if nothing set specifically - props.insert("spark.executor.cores".to_string(), max_cores.clone()); + let executor_cores = cores_from_quantity(max.0.clone())?; + // take rounded value for executor.cores (to determine the parallelism) but actual values for the pod + props.insert("spark.executor.cores".to_string(), executor_cores.clone()); props.insert( "spark.kubernetes.executor.request.cores".to_string(), - min_cores, + min.0.clone(), ); props.insert( "spark.kubernetes.executor.limit.cores".to_string(), - max_cores, + max.0.clone(), ); } @@ -1154,7 +1155,7 @@ mod tests { ), ( "spark.kubernetes.driver.request.cores".to_string(), - "1".to_string(), + "250m".to_string(), ), ] .into_iter() @@ -1194,7 +1195,7 @@ mod tests { ("spark.executor.memory".to_string(), "128m".to_string()), // 128 and not 512 because memory overhead is subtracted ( "spark.kubernetes.executor.request.cores".to_string(), - "1".to_string(), + "250m".to_string(), ), ( "spark.kubernetes.executor.limit.cores".to_string(), diff --git a/tests/templates/kuttl/resources/10-assert.yaml.j2 b/tests/templates/kuttl/resources/10-assert.yaml.j2 index eb4e706b..55c9e63c 100644 --- a/tests/templates/kuttl/resources/10-assert.yaml.j2 +++ b/tests/templates/kuttl/resources/10-assert.yaml.j2 @@ -33,10 +33,10 @@ spec: resources: # these resources are set via Spark submit properties like "spark.driver.cores" limits: - cpu: "2" + cpu: 1200m memory: 1Gi requests: - cpu: "1" + cpu: 300m memory: 1Gi --- apiVersion: v1 @@ -55,5 +55,5 @@ spec: cpu: "2" memory: 1Gi requests: - cpu: "2" + cpu: 1250m memory: 1Gi diff --git a/tests/templates/kuttl/resources/10-deploy-spark-app.yaml.j2 b/tests/templates/kuttl/resources/10-deploy-spark-app.yaml.j2 index cc8b9e5a..f56bef9b 100644 --- a/tests/templates/kuttl/resources/10-deploy-spark-app.yaml.j2 +++ b/tests/templates/kuttl/resources/10-deploy-spark-app.yaml.j2 @@ -36,7 +36,7 @@ spec: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} resources: cpu: - min: 200m + min: 300m max: 1200m memory: limit: 1024Mi From 690df415cd6738e660f976dd82b5751f5d21ae6a Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Tue, 21 May 2024 18:04:45 +0200 Subject: [PATCH 2/5] changelog --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b9397a4..a450ce46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,14 @@ All notable changes to this project will be documented in this file. ### Changed -- Update Rust dependency versions, most notably operator-rs 0.67.1 ([#401]) +- Update Rust dependency versions, most notably operator-rs 0.67.1 ([#401] + +### Fixed + +- Use actual values of resource CPU settings for pod values, but still rounding up for parallelism ([#408]). [#401]: https://github.com/stackabletech/spark-k8s-operator/pull/401 +[#408]: https://github.com/stackabletech/spark-k8s-operator/pull/408 ## [24.3.0] - 2024-03-20 From a935f8bd2cece8e8846f0737c151ffab90867a23 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Wed, 22 May 2024 09:46:35 +0200 Subject: [PATCH 3/5] Update CHANGELOG.md Co-authored-by: Sebastian Bernauer --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a450ce46..89e1fd4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file. ### Changed -- Update Rust dependency versions, most notably operator-rs 0.67.1 ([#401] +- Update Rust dependency versions, most notably operator-rs 0.67.1 ([#401]) ### Fixed From 6537f17f0eb16973a40fecba316be56fb2ae24c1 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Wed, 22 May 2024 10:23:18 +0200 Subject: [PATCH 4/5] adding note re. change-in-behaviour in changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e1fd4e..9f62db05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,9 @@ All notable changes to this project will be documented in this file. ### Fixed -- Use actual values of resource CPU settings for pod values, but still rounding up for parallelism ([#408]). +- BREAKING (behaviour): Specified CPU resources are now applied correctly (instead of rounding it to the next whole number). + This might affect your jobs, as they now e.g. only have 200m CPU resources requested instead of the 1000m it had so far, + meaning they might slow down significantly([#408]). [#401]: https://github.com/stackabletech/spark-k8s-operator/pull/401 [#408]: https://github.com/stackabletech/spark-k8s-operator/pull/408 From 5bac10237bfbb70704bedd0f80da5cace659ea04 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Wed, 22 May 2024 10:26:07 +0200 Subject: [PATCH 5/5] Update CHANGELOG.md Co-authored-by: Sebastian Bernauer --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f62db05..f3243125 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ All notable changes to this project will be documented in this file. - BREAKING (behaviour): Specified CPU resources are now applied correctly (instead of rounding it to the next whole number). This might affect your jobs, as they now e.g. only have 200m CPU resources requested instead of the 1000m it had so far, - meaning they might slow down significantly([#408]). + meaning they might slow down significantly ([#408]). [#401]: https://github.com/stackabletech/spark-k8s-operator/pull/401 [#408]: https://github.com/stackabletech/spark-k8s-operator/pull/408