From 8081b51ed8dce5cc4aa2cb66e9fac11e917358df Mon Sep 17 00:00:00 2001 From: Pavan Karkun Date: Thu, 7 Nov 2024 14:13:16 +0000 Subject: [PATCH 01/17] Do not use beta API for hyperdisk in multi-writer mode. --- pkg/gce-cloud-provider/compute/gce-compute.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/gce-cloud-provider/compute/gce-compute.go b/pkg/gce-cloud-provider/compute/gce-compute.go index c2c2b7603..0cd4ed343 100644 --- a/pkg/gce-cloud-provider/compute/gce-compute.go +++ b/pkg/gce-cloud-provider/compute/gce-compute.go @@ -651,7 +651,7 @@ func (cloud *CloudProvider) insertRegionalDisk( gceAPIVersion = GCEAPIVersionV1 ) - if multiWriter { + if multiWriter && !strings.Contains(params.DiskType, "hyperdisk") { gceAPIVersion = GCEAPIVersionBeta } @@ -778,7 +778,7 @@ func (cloud *CloudProvider) insertZonalDisk( opName string gceAPIVersion = GCEAPIVersionV1 ) - if multiWriter { + if multiWriter && !strings.Contains(params.DiskType, "hyperdisk") { gceAPIVersion = GCEAPIVersionBeta } From 41685427297d718c0059f5858295b5d042005b50 Mon Sep 17 00:00:00 2001 From: Pavan Karkun Date: Thu, 14 Nov 2024 05:42:36 +0000 Subject: [PATCH 02/17] Update GetMultiWriter() to reflect support for multi-writer in v1 disks --- pkg/gce-cloud-provider/compute/cloud-disk.go | 4 ++-- pkg/gce-cloud-provider/compute/gce-compute.go | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pkg/gce-cloud-provider/compute/cloud-disk.go b/pkg/gce-cloud-provider/compute/cloud-disk.go index 6790992c4..54c6e4655 100644 --- a/pkg/gce-cloud-provider/compute/cloud-disk.go +++ b/pkg/gce-cloud-provider/compute/cloud-disk.go @@ -215,8 +215,8 @@ func (d *CloudDisk) GetKMSKeyName() string { func (d *CloudDisk) GetMultiWriter() bool { switch { - case d.disk != nil: - return false + case d.disk != nil && d.disk.AccessMode == "READ_WRITE_MANY": + return true case d.betaDisk != nil: return d.betaDisk.MultiWriter default: diff --git a/pkg/gce-cloud-provider/compute/gce-compute.go b/pkg/gce-cloud-provider/compute/gce-compute.go index 0cd4ed343..eb7155ffe 100644 --- a/pkg/gce-cloud-provider/compute/gce-compute.go +++ b/pkg/gce-cloud-provider/compute/gce-compute.go @@ -553,9 +553,6 @@ func convertV1DiskToBetaDisk(v1Disk *computev1.Disk) *computebeta.Disk { AccessMode: v1Disk.AccessMode, } - // Hyperdisk doesn't currently support multiWriter (https://cloud.google.com/compute/docs/disks/hyperdisks#limitations), - // but if multiWriter + hyperdisk is supported in the future, we want the PDCSI driver to support this feature without - // any additional code change. if v1Disk.ProvisionedIops > 0 { betaDisk.ProvisionedIops = v1Disk.ProvisionedIops } @@ -619,9 +616,6 @@ func convertBetaDiskToV1Disk(betaDisk *computebeta.Disk) *computev1.Disk { AccessMode: betaDisk.AccessMode, } - // Hyperdisk doesn't currently support multiWriter (https://cloud.google.com/compute/docs/disks/hyperdisks#limitations), - // but if multiWriter + hyperdisk is supported in the future, we want the PDCSI driver to support this feature without - // any additional code change. if betaDisk.ProvisionedIops > 0 { v1Disk.ProvisionedIops = betaDisk.ProvisionedIops } @@ -651,6 +645,7 @@ func (cloud *CloudProvider) insertRegionalDisk( gceAPIVersion = GCEAPIVersionV1 ) + // Use beta API for non-hyperdisk types in multi-writer mode. if multiWriter && !strings.Contains(params.DiskType, "hyperdisk") { gceAPIVersion = GCEAPIVersionBeta } @@ -778,6 +773,8 @@ func (cloud *CloudProvider) insertZonalDisk( opName string gceAPIVersion = GCEAPIVersionV1 ) + + // Use beta API for non-hyperdisk types in multi-writer mode. if multiWriter && !strings.Contains(params.DiskType, "hyperdisk") { gceAPIVersion = GCEAPIVersionBeta } From 1a7eeebcca1f9431c4d84476b75a67b18d84aecc Mon Sep 17 00:00:00 2001 From: Travis Xiang Date: Fri, 15 Nov 2024 18:32:49 +0000 Subject: [PATCH 03/17] Upgrade resizer to v1.12.0 --- deploy/kubernetes/images/stable-master/image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/kubernetes/images/stable-master/image.yaml b/deploy/kubernetes/images/stable-master/image.yaml index c90f568af..2d2df2367 100644 --- a/deploy/kubernetes/images/stable-master/image.yaml +++ b/deploy/kubernetes/images/stable-master/image.yaml @@ -22,7 +22,7 @@ metadata: name: imagetag-csi-resizer imageTag: name: registry.k8s.io/sig-storage/csi-resizer - newTag: "v1.11.1" + newTag: "v1.12.0" --- apiVersion: builtin From 6039d2ba304c0491580e0388e7f79bdf3e8450e8 Mon Sep 17 00:00:00 2001 From: Travis Xiang Date: Fri, 15 Nov 2024 18:56:30 +0000 Subject: [PATCH 04/17] Upgrade rc resizer to v1.12.0 --- .../kubernetes/images/prow-stable-sidecar-rc-master/image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml index d140cffa2..55c27b516 100644 --- a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml +++ b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml @@ -23,7 +23,7 @@ metadata: name: imagetag-csi-resize-prow-rc imageTag: name: registry.k8s.io/sig-storage/csi-resizer - newTag: "v1.11.1" + newTag: "v1.12.0" --- apiVersion: builtin kind: ImageTagTransformer From 51fb1d0c02da553208a0b95560b8d94e9d9c01a1 Mon Sep 17 00:00:00 2001 From: Travis Xiang Date: Tue, 19 Nov 2024 16:50:42 +0000 Subject: [PATCH 05/17] Require VACs to use SI units --- examples/kubernetes/demo-vol-create.yaml | 4 ++-- pkg/common/parameters.go | 2 +- pkg/common/parameters_test.go | 2 +- pkg/gce-pd-csi-driver/controller_test.go | 14 +++++++------- test/e2e/tests/single_zone_e2e_test.go | 4 ++-- .../config/hdb-volumeattributesclass.yaml | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/kubernetes/demo-vol-create.yaml b/examples/kubernetes/demo-vol-create.yaml index 1ca0e0556..5fa536042 100644 --- a/examples/kubernetes/demo-vol-create.yaml +++ b/examples/kubernetes/demo-vol-create.yaml @@ -16,7 +16,7 @@ metadata: driverName: pd.csi.storage.gke.io parameters: iops: "3000" - throughput: "150" + throughput: "150Mi" --- apiVersion: storage.k8s.io/v1beta1 kind: VolumeAttributesClass @@ -25,7 +25,7 @@ metadata: driverName: pd.csi.storage.gke.io parameters: iops: "3013" - throughput: "151" + throughput: "151Mi" --- apiVersion: v1 kind: PersistentVolumeClaim diff --git a/pkg/common/parameters.go b/pkg/common/parameters.go index e80346ed8..872837193 100644 --- a/pkg/common/parameters.go +++ b/pkg/common/parameters.go @@ -343,7 +343,7 @@ func ExtractModifyVolumeParameters(parameters map[string]string) (ModifyVolumePa } modifyVolumeParams.IOPS = &iops case "throughput": - throughput, err := ConvertStringToInt64(value) + throughput, err := ConvertMiStringToInt64(value) if err != nil { return ModifyVolumeParameters{}, fmt.Errorf("parameters contain invalid throughput parameter: %w", err) } diff --git a/pkg/common/parameters_test.go b/pkg/common/parameters_test.go index cb60ecff0..35a116325 100644 --- a/pkg/common/parameters_test.go +++ b/pkg/common/parameters_test.go @@ -485,7 +485,7 @@ func TestSnapshotParameters(t *testing.T) { func TestExtractModifyVolumeParameters(t *testing.T) { parameters := map[string]string{ "iops": "1000", - "throughput": "500", + "throughput": "500Mi", } iops := int64(1000) diff --git a/pkg/gce-pd-csi-driver/controller_test.go b/pkg/gce-pd-csi-driver/controller_test.go index 0e79a3663..06db7222e 100644 --- a/pkg/gce-pd-csi-driver/controller_test.go +++ b/pkg/gce-pd-csi-driver/controller_test.go @@ -1789,7 +1789,7 @@ func TestCreateVolumeWithVolumeAttributeClassParameters(t *testing.T) { }, }, }, - MutableParameters: map[string]string{"iops": "20000", "throughput": "600"}, + MutableParameters: map[string]string{"iops": "20000", "throughput": "600Mi"}, }, expIops: 20000, expThroughput: 600, @@ -1822,7 +1822,7 @@ func TestCreateVolumeWithVolumeAttributeClassParameters(t *testing.T) { }, }, }, - MutableParameters: map[string]string{"iops": "20000", "throughput": "600"}, + MutableParameters: map[string]string{"iops": "20000", "throughput": "600Mi"}, }, expIops: 0, expThroughput: 0, @@ -1890,7 +1890,7 @@ func TestVolumeModifyOperation(t *testing.T) { name: "Update volume with valid parameters", req: &csi.ControllerModifyVolumeRequest{ VolumeId: testVolumeID, - MutableParameters: map[string]string{"iops": "20000", "throughput": "600"}, + MutableParameters: map[string]string{"iops": "20000", "throughput": "600Mi"}, }, diskType: "hyperdisk-balanced", params: &common.DiskParameters{ @@ -1906,7 +1906,7 @@ func TestVolumeModifyOperation(t *testing.T) { name: "Update volume with invalid parameters", req: &csi.ControllerModifyVolumeRequest{ VolumeId: testVolumeID, - MutableParameters: map[string]string{"iops": "0", "throughput": "0"}, + MutableParameters: map[string]string{"iops": "0", "throughput": "0Mi"}, }, diskType: "hyperdisk-balanced", params: &common.DiskParameters{ @@ -1922,7 +1922,7 @@ func TestVolumeModifyOperation(t *testing.T) { name: "Update volume with valid parameters but invalid disk type", req: &csi.ControllerModifyVolumeRequest{ VolumeId: testVolumeID, - MutableParameters: map[string]string{"iops": "20000", "throughput": "600"}, + MutableParameters: map[string]string{"iops": "20000", "throughput": "600Mi"}, }, diskType: "pd-ssd", params: &common.DiskParameters{ @@ -2053,7 +2053,7 @@ func TestVolumeModifyErrorHandling(t *testing.T) { }, }, modifyReq: &csi.ControllerModifyVolumeRequest{ - MutableParameters: map[string]string{"iops": "3001", "throughput": "151"}, + MutableParameters: map[string]string{"iops": "3001", "throughput": "151Mi"}, }, modifyVolumeErrors: map[*meta.Key]error{ meta.ZonalKey(name, "us-central1-a"): &googleapi.Error{ @@ -2089,7 +2089,7 @@ func TestVolumeModifyErrorHandling(t *testing.T) { }, }, modifyReq: &csi.ControllerModifyVolumeRequest{ - MutableParameters: map[string]string{"iops": "10000", "throughput": "2400"}, + MutableParameters: map[string]string{"iops": "10000", "throughput": "2400Mi"}, }, modifyVolumeErrors: map[*meta.Key]error{ meta.ZonalKey(name, "us-central1-a"): &googleapi.Error{Code: int(codes.InvalidArgument), Message: "InvalidArgument"}, diff --git a/test/e2e/tests/single_zone_e2e_test.go b/test/e2e/tests/single_zone_e2e_test.go index cd60d8b73..c77c7b911 100644 --- a/test/e2e/tests/single_zone_e2e_test.go +++ b/test/e2e/tests/single_zone_e2e_test.go @@ -1622,7 +1622,7 @@ var _ = Describe("GCE PD CSI Driver", func() { stringPtr(provisionedIOPSOnCreateHdb), stringPtr(provisionedThroughputOnCreateHdb), stringPtr("3013"), - stringPtr("181"), + stringPtr("181Mi"), ), Entry( "for hyperdisk-extreme", @@ -1640,7 +1640,7 @@ var _ = Describe("GCE PD CSI Driver", func() { nil, stringPtr(provisionedThroughputOnCreate), nil, - stringPtr("70"), + stringPtr("70Mi"), ), ) }) diff --git a/test/k8s-integration/config/hdb-volumeattributesclass.yaml b/test/k8s-integration/config/hdb-volumeattributesclass.yaml index f032912d2..73d5ff4f9 100644 --- a/test/k8s-integration/config/hdb-volumeattributesclass.yaml +++ b/test/k8s-integration/config/hdb-volumeattributesclass.yaml @@ -5,4 +5,4 @@ metadata: driverName: pd.csi.storage.gke.io parameters: iops: "3600" - throughput: "290" \ No newline at end of file + throughput: "290Mi" \ No newline at end of file From 50f8af2b58363d17d9c5f517921e2b4b01b157b3 Mon Sep 17 00:00:00 2001 From: Peter Schuurman Date: Wed, 27 Nov 2024 10:27:25 -0800 Subject: [PATCH 06/17] Fix ./hack/verify-docker-deps.sh script to run on build platform --- Dockerfile | 24 ++++++------------------ Makefile | 28 ++++++++++++++++------------ hack/print-missing-deps.sh | 24 ++++++++++++++++++++++-- hack/verify-docker-deps.sh | 3 ++- 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/Dockerfile b/Dockerfile index aa3e287a5..d439ed960 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM --platform=$BUILDPLATFORM golang:1.23.0 as builder +FROM --platform=$BUILDPLATFORM golang:1.23.0 AS builder ARG STAGINGVERSION ARG TARGETPLATFORM @@ -23,24 +23,24 @@ RUN GOARCH=$(echo $TARGETPLATFORM | cut -f2 -d '/') GCE_PD_CSI_STAGING_VERSION=$ # Start from Kubernetes Debian base. -FROM gke.gcr.io/debian-base:bookworm-v1.0.4-gke.2 as debian +FROM gke.gcr.io/debian-base:bookworm-v1.0.4-gke.2 AS debian # Install necessary dependencies # google_nvme_id script depends on the following packages: nvme-cli, xxd, bash RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs nvme-cli xxd bash # Since we're leveraging apt to pull in dependencies, we use `gcr.io/distroless/base` because it includes glibc. -FROM gcr.io/distroless/base-debian12 as distroless-base +FROM gcr.io/distroless/base-debian12 AS distroless-base # The distroless amd64 image has a target triplet of x86_64 FROM distroless-base AS distroless-amd64 -ENV LIB_DIR_PREFIX x86_64 +ENV LIB_DIR_PREFIX=x86_64 # The distroless arm64 image has a target triplet of aarch64 FROM distroless-base AS distroless-arm64 -ENV LIB_DIR_PREFIX aarch64 +ENV LIB_DIR_PREFIX=aarch64 -FROM distroless-$TARGETARCH as output-image +FROM distroless-$TARGETARCH # Copy necessary dependencies into distroless base. COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver @@ -119,16 +119,4 @@ COPY --from=debian /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libblkid.so.1 \ # Copy NVME support required script and rules into distroless base. COPY deploy/kubernetes/udev/google_nvme_id /lib/udev_containerized/google_nvme_id -# Build stage used for validation of the output-image -# See validate-container-linux-* targets in Makefile -FROM output-image as validation-image - -COPY --from=debian /usr/bin/ldd /usr/bin/find /usr/bin/xargs /usr/bin/ -COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/hack/print-missing-deps.sh /print-missing-deps.sh -SHELL ["/bin/bash", "-c"] -RUN /print-missing-deps.sh - -# Final build stage, create the real Docker image with ENTRYPOINT -FROM output-image - ENTRYPOINT ["/gce-pd-csi-driver"] diff --git a/Makefile b/Makefile index e77796aff..bfd6314ba 100644 --- a/Makefile +++ b/Makefile @@ -73,20 +73,24 @@ build-and-push-multi-arch-debug: build-and-push-container-linux-debug build-and- push-container: build-container # Used by hack/verify-docker-deps.sh, not used for building artifacts -validate-container-linux-amd64: init-buildx - $(DOCKER) buildx build --platform=linux/amd64 \ - -t validation_linux_amd64 \ - --target validation-image \ - --build-arg BUILDPLATFORM=linux \ - --build-arg STAGINGVERSION=$(STAGINGVERSION) . +validate-container-linux-amd64: build-and-load-container-linux-amd64 + ./hack/print-missing-deps.sh $(STAGINGIMAGE):$(STAGINGVERSION)_linux_amd64 # Used by hack/verify-docker-deps.sh, not used for building artifacts -validate-container-linux-arm64: init-buildx - $(DOCKER) buildx build --platform=linux/arm64 \ - -t validation_linux_arm64 \ - --target validation-image \ - --build-arg BUILDPLATFORM=linux \ - --build-arg STAGINGVERSION=$(STAGINGVERSION) . +validate-container-linux-arm64: build-and-load-container-linux-arm64 + ./hack/print-missing-deps.sh $(STAGINGIMAGE):$(STAGINGVERSION)_linux_arm64 + +validate-container-linux: validate-container-linux-amd64 validate-container-linux-arm64 + +build-and-load-container-linux-amd64: require-GCE_PD_CSI_STAGING_IMAGE init-buildx + $(DOCKER) buildx build --platform=linux/amd64 \ + -t $(STAGINGIMAGE):$(STAGINGVERSION)_linux_amd64 \ + --build-arg STAGINGVERSION=$(STAGINGVERSION) --load . + +build-and-load-container-linux-arm64: require-GCE_PD_CSI_STAGING_IMAGE init-buildx + $(DOCKER) buildx build --file=Dockerfile --platform=linux/arm64 \ + -t $(STAGINGIMAGE):$(STAGINGVERSION)_linux_arm64 \ + --build-arg STAGINGVERSION=$(STAGINGVERSION) --load . build-and-push-container-linux-amd64: require-GCE_PD_CSI_STAGING_IMAGE init-buildx $(DOCKER) buildx build --platform=linux/amd64 \ diff --git a/hack/print-missing-deps.sh b/hack/print-missing-deps.sh index 9c1708354..a41e09b09 100755 --- a/hack/print-missing-deps.sh +++ b/hack/print-missing-deps.sh @@ -20,13 +20,33 @@ set -o pipefail echo "Verifying Docker Executables have appropriate dependencies" +TEMP_DIR="$(mktemp -d)" +trap 'rm -rf -- "$TEMP_DIR"' EXIT + +export CONTAINER_IMAGE="$1" +export CONTAINER_EXPORT_DIR="$TEMP_DIR/image_dir" + +extractContainerImage() { + CONTAINER_ID="$(docker create "$CONTAINER_IMAGE")" + CONTAINER_EXPORT_TAR="$TEMP_DIR/image.tar" + docker export "$CONTAINER_ID" -o "$CONTAINER_EXPORT_TAR" + mkdir -p "$CONTAINER_EXPORT_DIR" + tar xf "$CONTAINER_EXPORT_TAR" -C "$CONTAINER_EXPORT_DIR" +} + +printNeededDeps() { + readelf -d "$@" 2>&1 | grep NEEDED | awk '{print $5}' | sed -e 's@\[@@g' -e 's@\]@@g' +} + printMissingDep() { - if /usr/bin/ldd "$@" | grep "not found"; then + if ! find "$CONTAINER_EXPORT_DIR" -name "$@" > /dev/null; then echo "!!! Missing deps for $@ !!!" exit 1 fi } +export -f printNeededDeps export -f printMissingDep -/usr/bin/find / -type f -executable -print | /usr/bin/xargs -I {} /bin/bash -c 'printMissingDep "{}"' +extractContainerImage +/usr/bin/find "$CONTAINER_EXPORT_DIR" -type f -executable -print | /usr/bin/xargs -I {} /bin/bash -c 'printNeededDeps "{}"' | sort | uniq | /usr/bin/xargs -I {} /bin/bash -c 'printMissingDep "{}"' diff --git a/hack/verify-docker-deps.sh b/hack/verify-docker-deps.sh index 0324df3e8..ef35c67e4 100755 --- a/hack/verify-docker-deps.sh +++ b/hack/verify-docker-deps.sh @@ -22,4 +22,5 @@ echo "Verifying Docker Image Dependencies" PKG_ROOT=$(git rev-parse --show-toplevel) -make -C "${PKG_ROOT}" validate-container-linux-amd64 validate-container-linux-arm64 +export GCE_PD_CSI_STAGING_IMAGE=validation-image +make -C "${PKG_ROOT}" validate-container-linux From 4249ee3fe376242294ce975da2c6187424660b7d Mon Sep 17 00:00:00 2001 From: Matthew Cary Date: Wed, 27 Nov 2024 11:27:22 -0800 Subject: [PATCH 07/17] Remove make from verify-all in order to reduce presubmit flakes --- hack/verify-all.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/hack/verify-all.sh b/hack/verify-all.sh index c73cbcb64..b737bb5b7 100755 --- a/hack/verify-all.sh +++ b/hack/verify-all.sh @@ -23,5 +23,3 @@ PKG_ROOT=$(git rev-parse --show-toplevel) "${PKG_ROOT}/hack/verify-gofmt.sh" "${PKG_ROOT}/hack/verify-govet.sh" "${PKG_ROOT}/hack/verify-docker-deps.sh" - -make -C "${PKG_ROOT}" all From 6449ad193bc972d329fd23b8aa39a46137043af9 Mon Sep 17 00:00:00 2001 From: Sneha Aradhey Date: Wed, 23 Oct 2024 17:15:44 +0000 Subject: [PATCH 08/17] update changelog --- CHANGELOG/CHANGELOG-1.12.md | 42 ++++++++++++++++++++++++++++++++++++ CHANGELOG/CHANGELOG-1.13.md | 43 +++++++++++++++++++++++++++++++++++++ CHANGELOG/CHANGELOG-1.14.md | 23 ++++++++++++++++++++ CHANGELOG/CHANGELOG-1.15.md | 36 +++++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 CHANGELOG/CHANGELOG-1.15.md diff --git a/CHANGELOG/CHANGELOG-1.12.md b/CHANGELOG/CHANGELOG-1.12.md index c478d481f..9c88122d1 100644 --- a/CHANGELOG/CHANGELOG-1.12.md +++ b/CHANGELOG/CHANGELOG-1.12.md @@ -1,3 +1,45 @@ +# v1.12.13 - Changelog since v1.12.11 + +## Changes by Kind + +### Bug +- Update Go version and dependencies to fix CVE-2024-24790,CVE-2024-24789 by @Sneha-at in #1851 + +# v1.12.12 - Changelog since v1.12.11 + +## Changes by Kind + +### Bug +- Automated cherry pick of #1658: Add support for checking if a device is being used by a filesystem by @pwschuurman in #1845 + + +# v1.12.11 - Changelog since v1.12.10 + +## Changes by Kind + +### Bug +- [release-1.12] Update debian image from bullseye to bookworm to fix CVEs by @k8s-infra-cherrypick-robot in #1735 +- Reverting the Dockerfile debian image from bookworm to bullseye due to regression by @Sneha-at in #1774 +- [release-1.12] Return Unavailable for 'connection reset by peer' errors by @k8s-infra-cherrypick-robot in #1813 +- Automated cherry pick of #1708: Properly unwrap gce-compute error code. by @hime in #1840 + +# v1.12.10 - Changelog since v1.12.9 + +## Changes by Kind + +### Bug +- Fix CVE-2023-45288 by @dannawang0221 in #1682 + + +# v1.12.9 - Changelog since v1.12.8 + +## Changes by Kind + +### Bug + +- Change GetDisk error reporting to temporary in CreateVolume codepath ([#1600])https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/pull/1600), [@k8s-infra-cherrypick-robot](https://github.com/k8s-infra-cherrypick-robot)) +- [release-1.12] Fix nvme path filtering logic for udevadm trigger by @k8s-infra-cherrypick-robot in #1647 + # v1.12.8 - Changelog since v1.12.7 ## Changes by Kind diff --git a/CHANGELOG/CHANGELOG-1.13.md b/CHANGELOG/CHANGELOG-1.13.md index 8d6a80b0a..e96b2f4ff 100644 --- a/CHANGELOG/CHANGELOG-1.13.md +++ b/CHANGELOG/CHANGELOG-1.13.md @@ -1,3 +1,46 @@ +# v1.13.8 - Changelog since v1.13.7 + +## Changes by Kind + +### Uncategorized + +- [release-1.13] Properly unwrap gce-compute error code. by @hime in #1839 + + +# v1.13.7 - Changelog since v1.13.6 + +## Changes by Kind + +### Uncategorized + +- [release-1.13] Reassign error returned from validateStoragePools so InvalidArgument is recorded by @k8s-infra-cherrypick-robot in #1721 +- [release-1.13] Return Unavailable for 'connection reset by peer' errors by @k8s-infra-cherrypick-robot in #1724 +- [release-1.13] Update debian image from bullseye to bookworm to fix CVEs by @k8s-infra-cherrypick-robot in #1734 +- Reverting the Dockerfile debian image from bookworm to bullseye due to regression by @Sneha-at in #1775 +- Automated cherry pick of #1658: Add support for checking if a device is being used by a by @pwschuurman in #1805 + + +# v1.13.6 - Changelog since v1.13.5 + +## Changes by Kind + +### Uncategorized + +- Automated cherry pick of #1666: migrate hyperdisk/chd/storagepools to GCE v1 disk API +#1667: remove support for GCE Alpha Disks by @amacaskill in #1669 +- [release-1.13] Record original error code to operation_errors metric for temporary errors by @k8s-infra-cherrypick-robot in #1672 +- [release-1.13] Remove short variable declaration from validateStoragePools by @k8s-infra-cherrypick-robot in #1674 +- Fix CVE-2023-45288 by @dannawang0221 in #1683 + + +# v1.13.5 - Changelog since v1.13.4 + +## Changes by Kind + +### Uncategorized +_Nothing has changed._ + + # v1.13.4 - Changelog since v1.13.3 ## Changes by Kind diff --git a/CHANGELOG/CHANGELOG-1.14.md b/CHANGELOG/CHANGELOG-1.14.md index 338f101e4..f1da406f4 100644 --- a/CHANGELOG/CHANGELOG-1.14.md +++ b/CHANGELOG/CHANGELOG-1.14.md @@ -1,3 +1,26 @@ +# v1.14.3 - Changelog since v1.14.2 + +## Changes by Kind + +### Bug or Regression +- Update base image to bookworm-v1.0.4-gke.2 by @k8s-infra-cherrypick-robot in #1834 + + +# v1.14.2 - Changelog since v1.14.1 + +## Changes by Kind + +### Uncategorized +- [release-1.14] Change OPERATION_CANCELED_BY_USER to Canceled instead of Aborted by @k8s-infra-cherrypick-robot in #1790 + +# v1.14.1 - Changelog since v1.14.0 + +## Changes by Kind + +### Bug or Regression +- [release-1.14] Adding missing libgpg-error.so.0 required by nvme-cli by @k8s-infra-cherrypick-robot in #1765 + + # v1.14.0 - Changelog since v1.13.6 ## Changes by Kind diff --git a/CHANGELOG/CHANGELOG-1.15.md b/CHANGELOG/CHANGELOG-1.15.md new file mode 100644 index 000000000..c612677e2 --- /dev/null +++ b/CHANGELOG/CHANGELOG-1.15.md @@ -0,0 +1,36 @@ +# v1.15.2 - Changelog since v1.15.1 + +- [release-1.15] Map RESOURCE_OPERATION_RATE_EXCEEDED to ResourceExhausted by @k8s-infra-cherrypick-robot in #1848 +- [release-1.15] Add StatusConflict http kind to userErrorCodeMap. by @k8s-infra-cherrypick-robot in #1850 +- Automated cherry pick of #1826: Add ControllerModifyVolume E2E tests +- #1836: Create documentation for ControllerModifyVolume and controller default +- #1838: Enable VolumeAttributesClass feature gate for CI runs + + +# v1.15.1 - Changelog since v1.15.0 +### Bug +- Update base image to bookworm-v1.0.4-gke.2 by @k8s-infra-cherrypick-robot in #1833 + +# v1.15.0 - Changelog since v1.14.3 + +## Changes by Kind + +### Bug or Regression +- Adding missing libgpg-error.so.0 required by nvme-cli by @pwschuurman in #1760 +- Format byte array error output from google_nvme_id as string by @pwschuurman in #1761 + +### Feature +- Add ControllerModifyVolume functionality by @travisyx in #1801 + +### Uncategorized +- Add verify-docker-deps.sh to verify-all.sh by @pwschuurman in #1762 +- Change OPERATION_CANCELED_BY_USER to Canceled instead of Aborted by @amacaskill in #1789 +- Bump the onsi group across 1 directory with 2 updates by @dependabot in #1811 +- Upgrade sanity tests to v5.3.0 and CSI Spec to v1.10.0 by @travisyx in #1814 +- Add manual deployment instructions for Storage Pools by @amacaskill in #1817 +- Update stable rc master image to point to v1.14.2-rc1 by @amacaskill in #1806 +- Bump golang from 1.22.5 to 1.23.0 by @dependabot in #1808 +- Bump golang from 1.22.4 to 1.22.5 by @dependabot in #1778 +- prune changelog for 1.14 by @pwschuurman in #1745 +- Add back CHANGELOG, removed in #1745 by mistake by @pwschuurman in #1746 +- Add support for running tests on confidential VMs that use NVMe by @pwschuurman in #1636 From a1ed4e4635274497b2cd0d87af2417c456b59b3a Mon Sep 17 00:00:00 2001 From: Peter Schuurman Date: Tue, 26 Nov 2024 15:17:46 -0800 Subject: [PATCH 09/17] Migrate metric defer() statements to gRPC metric interceptor. This allows for more accurate error code reporting if gRPC functionality is refactored --- cmd/gce-pd-csi-driver/main.go | 4 +- pkg/gce-pd-csi-driver/controller.go | 102 ++-------- pkg/gce-pd-csi-driver/gce-pd-driver.go | 5 +- pkg/gce-pd-csi-driver/gce-pd-driver_test.go | 11 +- pkg/gce-pd-csi-driver/server.go | 23 ++- pkg/gce-pd-csi-driver/server_test.go | 212 ++++++++++++++++++++ pkg/metrics/interceptor.go | 23 +++ pkg/metrics/metadata.go | 55 +++++ pkg/metrics/metrics.go | 25 +-- pkg/metrics/metrics_test.go | 17 +- pkg/metrics/metrics_test_util.go | 23 +++ test/sanity/sanity_test.go | 2 +- 12 files changed, 379 insertions(+), 123 deletions(-) create mode 100644 pkg/gce-pd-csi-driver/server_test.go create mode 100644 pkg/metrics/interceptor.go create mode 100644 pkg/metrics/metadata.go create mode 100644 pkg/metrics/metrics_test_util.go diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go index 4513a82aa..5c6e5a83a 100644 --- a/cmd/gce-pd-csi-driver/main.go +++ b/cmd/gce-pd-csi-driver/main.go @@ -143,6 +143,7 @@ func handle() { }() } + var metricsManager *metrics.MetricsManager = nil if *runControllerService && *httpEndpoint != "" { mm := metrics.NewMetricsManager() mm.InitializeHttpHandler(*httpEndpoint, *metricsPath) @@ -151,6 +152,7 @@ func handle() { if metrics.IsGKEComponentVersionAvailable() { mm.EmitGKEComponentVersion() } + metricsManager = &mm } if len(*extraVolumeLabelsStr) > 0 && !*runControllerService { @@ -261,7 +263,7 @@ func handle() { gce.WaitForOpBackoff.Steps = *waitForOpBackoffSteps gce.WaitForOpBackoff.Cap = *waitForOpBackoffCap - gceDriver.Run(*endpoint, *grpcLogCharCap, *enableOtelTracing) + gceDriver.Run(*endpoint, *grpcLogCharCap, *enableOtelTracing, metricsManager) } func notEmpty(v string) bool { diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go index 5a47e5da9..decac55a0 100644 --- a/pkg/gce-pd-csi-driver/controller.go +++ b/pkg/gce-pd-csi-driver/controller.go @@ -21,7 +21,6 @@ import ( "math/rand" neturl "net/url" "sort" - "strconv" "strings" "time" @@ -303,12 +302,14 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre func (gceCS *GCEControllerServer) createVolumeInternal(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() + // Apply Parameters (case-insensitive). We leave validation of + // the values to the cloud provider. + params, err := gceCS.parameterProcessor().ExtractAndDefaultParameters(req.GetParameters(), gceCS.Driver.extraVolumeLabels, gceCS.Driver.extraTags) + metrics.UpdateRequestMetadataFromParams(ctx, params) + if err != nil { + return nil, status.Errorf(codes.InvalidArgument, "failed to extract parameters: %v", err.Error()) + } + // Validate arguments volumeCapabilities := req.GetVolumeCapabilities() capacityRange := req.GetCapacityRange() @@ -328,17 +329,6 @@ func (gceCS *GCEControllerServer) createVolumeInternal(ctx context.Context, req if err != nil { return nil, status.Errorf(codes.InvalidArgument, "VolumeCapabilities is invalid: %v", err.Error()) } - - // Apply Parameters (case-insensitive). We leave validation of - // the values to the cloud provider. - params, err := gceCS.parameterProcessor().ExtractAndDefaultParameters(req.GetParameters(), gceCS.Driver.extraVolumeLabels, gceCS.Driver.extraTags) - diskTypeForMetric = params.DiskType - enableConfidentialCompute = strconv.FormatBool(params.EnableConfidentialCompute) - hasStoragePools := len(params.StoragePools) > 0 - enableStoragePools = strconv.FormatBool(hasStoragePools) - if err != nil { - return nil, status.Errorf(codes.InvalidArgument, "failed to extract parameters: %v", err.Error()) - } // https://github.com/container-storage-interface/spec/blob/master/spec.md#createvolume // mutable_parameters MUST take precedence over the values from parameters. mutableParams := req.GetMutableParameters() @@ -782,14 +772,6 @@ func (gceCS *GCEControllerServer) ControllerModifyVolume(ctx context.Context, re return nil, status.Error(codes.InvalidArgument, "volume ID must be provided") } - diskType := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("ControllerModifyVolume", err, diskType, enableConfidentialCompute, enableStoragePools) - }() - project, volKey, err := common.VolumeIDToKey(volumeID) if err != nil { // Cannot find volume associated with this ID because VolumeID is not in the correct format @@ -806,6 +788,7 @@ func (gceCS *GCEControllerServer) ControllerModifyVolume(ctx context.Context, re klog.V(4).Infof("Modify Volume Parameters for %s: %v", volumeID, volumeModifyParams) existingDisk, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionBeta) + metrics.UpdateRequestMetadataFromDisk(ctx, existingDisk) if err != nil { err = fmt.Errorf("Failed to get volume: %w", err) @@ -816,9 +799,9 @@ func (gceCS *GCEControllerServer) ControllerModifyVolume(ctx context.Context, re err = status.Errorf(codes.Internal, "failed to get volume : %s", volumeID) return nil, err } - diskType = existingDisk.GetPDType() // Check if the disk supports dynamic IOPS/Throughput provisioning + diskType := existingDisk.GetPDType() supportsIopsChange := gceCS.diskSupportsIopsChange(diskType) supportsThroughputChange := gceCS.diskSupportsThroughputChange(diskType) if !supportsIopsChange && !supportsThroughputChange { @@ -834,8 +817,6 @@ func (gceCS *GCEControllerServer) ControllerModifyVolume(ctx context.Context, re return nil, err } - enableStoragePools = strconv.FormatBool(existingDisk.GetEnableStoragePools()) - err = gceCS.CloudProvider.UpdateDisk(ctx, project, volKey, existingDisk, volumeModifyParams) if err != nil { klog.Errorf("Failed to modify volume %s: %v", volumeID, err) @@ -883,12 +864,6 @@ func getGCEApiVersion(multiWriter bool) gce.GCEAPIVersion { func (gceCS *GCEControllerServer) deleteMultiZoneDisk(ctx context.Context, req *csi.DeleteVolumeRequest, project string, volKey *meta.Key) (*csi.DeleteVolumeResponse, error) { // List disks with same name var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("DeleteVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() existingZones := []string{gceCS.CloudProvider.GetDefaultZone()} zones, err := getDefaultZonesInRegion(ctx, gceCS, existingZones) if err != nil { @@ -910,7 +885,7 @@ func (gceCS *GCEControllerServer) deleteMultiZoneDisk(ctx context.Context, req * } disk, _ := gceCS.CloudProvider.GetDisk(ctx, project, zonalVolKey, gce.GCEAPIVersionV1) // TODO: Consolidate the parameters here, rather than taking the last. - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) err := gceCS.CloudProvider.DeleteDisk(ctx, project, zonalVolKey) if err != nil { deleteDiskErrs = append(deleteDiskErrs, gceCS.CloudProvider.DeleteDisk(ctx, project, volKey)) @@ -927,12 +902,6 @@ func (gceCS *GCEControllerServer) deleteMultiZoneDisk(ctx context.Context, req * func (gceCS *GCEControllerServer) deleteSingleDeviceDisk(ctx context.Context, req *csi.DeleteVolumeRequest, project string, volKey *meta.Key) (*csi.DeleteVolumeResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("DeleteVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() volumeID := req.GetVolumeId() project, volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, project, volKey) if err != nil { @@ -948,7 +917,7 @@ func (gceCS *GCEControllerServer) deleteSingleDeviceDisk(ctx context.Context, re } defer gceCS.volumeLocks.Release(volumeID) disk, _ := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) err = gceCS.CloudProvider.DeleteDisk(ctx, project, volKey) if err != nil { return nil, common.LoggedError("Failed to delete disk: ", err) @@ -960,12 +929,6 @@ func (gceCS *GCEControllerServer) deleteSingleDeviceDisk(ctx context.Context, re func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() // Only valid requests will be accepted _, _, _, err = gceCS.validateControllerPublishVolumeRequest(ctx, req) if err != nil { @@ -978,7 +941,7 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r } resp, err, disk := gceCS.executeControllerPublishVolume(ctx, req) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) if err != nil { klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err) gceCS.errorBackoff.next(backoffId, common.CodeForError(err)) @@ -1192,12 +1155,6 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("ControllerUnpublishVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() _, _, err = gceCS.validateControllerUnpublishVolumeRequest(ctx, req) if err != nil { return nil, err @@ -1209,7 +1166,7 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context, return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId) } resp, err, disk := gceCS.executeControllerUnpublishVolume(ctx, req) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) if err != nil { klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err) gceCS.errorBackoff.next(backoffId, common.CodeForError(err)) @@ -1316,12 +1273,6 @@ func (gceCS *GCEControllerServer) parameterProcessor() *common.ParameterProcesso func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context, req *csi.ValidateVolumeCapabilitiesRequest) (*csi.ValidateVolumeCapabilitiesResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("ValidateVolumeCapabilities", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() if req.GetVolumeCapabilities() == nil || len(req.GetVolumeCapabilities()) == 0 { return nil, status.Error(codes.InvalidArgument, "Volume Capabilities must be provided") } @@ -1348,7 +1299,7 @@ func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context defer gceCS.volumeLocks.Release(volumeID) disk, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) if err != nil { if gce.IsGCENotFoundError(err) { return nil, status.Errorf(codes.NotFound, "Could not find disk %v: %v", volKey.Name, err.Error()) @@ -1564,12 +1515,6 @@ func (gceCS *GCEControllerServer) ControllerGetCapabilities(ctx context.Context, func (gceCS *GCEControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() // Validate arguments volumeID := req.GetSourceVolumeId() if len(req.Name) == 0 { @@ -1595,7 +1540,7 @@ func (gceCS *GCEControllerServer) CreateSnapshot(ctx context.Context, req *csi.C // Check if volume exists disk, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(disk) + metrics.UpdateRequestMetadataFromDisk(ctx, disk) if err != nil { if gce.IsGCENotFoundError(err) { return nil, status.Errorf(codes.NotFound, "CreateSnapshot could not find disk %v: %v", volKey.String(), err.Error()) @@ -1823,12 +1768,6 @@ func isCSISnapshotReady(status string) (bool, error) { func (gceCS *GCEControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error) { var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("DeleteSnapshot", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() // Validate arguments snapshotID := req.GetSnapshotId() if len(snapshotID) == 0 { @@ -1913,14 +1852,7 @@ func (gceCS *GCEControllerServer) ListSnapshots(ctx context.Context, req *csi.Li } func (gceCS *GCEControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error) { - var err error - diskTypeForMetric := metrics.DefaultDiskTypeForMetric - enableConfidentialCompute := metrics.DefaultEnableConfidentialCompute - enableStoragePools := metrics.DefaultEnableStoragePools - defer func() { - gceCS.Metrics.RecordOperationErrorMetrics("ControllerExpandVolume", err, diskTypeForMetric, enableConfidentialCompute, enableStoragePools) - }() volumeID := req.GetVolumeId() if len(volumeID) == 0 { return nil, status.Error(codes.InvalidArgument, "ControllerExpandVolume volume ID must be provided") @@ -1950,7 +1882,7 @@ func (gceCS *GCEControllerServer) ControllerExpandVolume(ctx context.Context, re } sourceDisk, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1) - diskTypeForMetric, enableConfidentialCompute, enableStoragePools = metrics.GetMetricParameters(sourceDisk) + metrics.UpdateRequestMetadataFromDisk(ctx, sourceDisk) resizedGb, err := gceCS.CloudProvider.ResizeDisk(ctx, project, volKey, reqBytes) if err != nil { diff --git a/pkg/gce-pd-csi-driver/gce-pd-driver.go b/pkg/gce-pd-csi-driver/gce-pd-driver.go index b41ce6e3d..8bc26ae74 100644 --- a/pkg/gce-pd-csi-driver/gce-pd-driver.go +++ b/pkg/gce-pd-csi-driver/gce-pd-driver.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils" gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" metadataservice "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/metadata" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager" ) @@ -170,12 +171,12 @@ func NewControllerServer(gceDriver *GCEDriver, cloudProvider gce.GCECompute, err } } -func (gceDriver *GCEDriver) Run(endpoint string, grpcLogCharCap int, enableOtelTracing bool) { +func (gceDriver *GCEDriver) Run(endpoint string, grpcLogCharCap int, enableOtelTracing bool, metricsManager *metrics.MetricsManager) { maxLogChar = grpcLogCharCap klog.V(4).Infof("Driver: %v", gceDriver.name) //Start the nonblocking GRPC - s := NewNonBlockingGRPCServer(enableOtelTracing) + s := NewNonBlockingGRPCServer(enableOtelTracing, metricsManager) // TODO(#34): Only start specific servers based on a flag. // In the future have this only run specific combinations of servers depending on which version this is. // The schema for that was in util. basically it was just s.start but with some nil servers. diff --git a/pkg/gce-pd-csi-driver/gce-pd-driver_test.go b/pkg/gce-pd-csi-driver/gce-pd-driver_test.go index 7e8138207..628419526 100644 --- a/pkg/gce-pd-csi-driver/gce-pd-driver_test.go +++ b/pkg/gce-pd-csi-driver/gce-pd-driver_test.go @@ -41,8 +41,7 @@ func initBlockingGCEDriver(t *testing.T, cloudDisks []*gce.CloudDisk, readyToExe return initGCEDriverWithCloudProvider(t, fakeBlockingBlockProvider) } -func initGCEDriverWithCloudProvider(t *testing.T, cloudProvider gce.GCECompute) *GCEDriver { - vendorVersion := "test-vendor" +func controllerServerForTest(cloudProvider gce.GCECompute) *GCEControllerServer { gceDriver := GetGCEDriver() errorBackoffInitialDuration := 200 * time.Millisecond errorBackoffMaxDuration := 5 * time.Minute @@ -55,7 +54,13 @@ func initGCEDriverWithCloudProvider(t *testing.T, cloudProvider gce.GCECompute) SupportsThroughputChange: []string{"hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-ml"}, } - controllerServer := NewControllerServer(gceDriver, cloudProvider, errorBackoffInitialDuration, errorBackoffMaxDuration, fallbackRequisiteZones, enableStoragePools, multiZoneVolumeHandleConfig, listVolumesConfig, provisionableDisksConfig) + return NewControllerServer(gceDriver, cloudProvider, errorBackoffInitialDuration, errorBackoffMaxDuration, fallbackRequisiteZones, enableStoragePools, multiZoneVolumeHandleConfig, listVolumesConfig, provisionableDisksConfig) +} + +func initGCEDriverWithCloudProvider(t *testing.T, cloudProvider gce.GCECompute) *GCEDriver { + vendorVersion := "test-vendor" + gceDriver := GetGCEDriver() + controllerServer := controllerServerForTest(cloudProvider) err := gceDriver.SetupGCEDriver(driver, vendorVersion, nil, nil, nil, controllerServer, nil) if err != nil { t.Fatalf("Failed to setup GCE Driver: %v", err) diff --git a/pkg/gce-pd-csi-driver/server.go b/pkg/gce-pd-csi-driver/server.go index 8dd212e7b..573d66b4f 100644 --- a/pkg/gce-pd-csi-driver/server.go +++ b/pkg/gce-pd-csi-driver/server.go @@ -24,6 +24,7 @@ import ( "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "k8s.io/klog/v2" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" csi "github.com/container-storage-interface/spec/lib/go/csi" ) @@ -40,15 +41,16 @@ type NonBlockingGRPCServer interface { ForceStop() } -func NewNonBlockingGRPCServer(enableOtelTracing bool) NonBlockingGRPCServer { - return &nonBlockingGRPCServer{otelTracing: enableOtelTracing} +func NewNonBlockingGRPCServer(enableOtelTracing bool, metricsManager *metrics.MetricsManager) NonBlockingGRPCServer { + return &nonBlockingGRPCServer{otelTracing: enableOtelTracing, metricsManager: metricsManager} } // NonBlocking server type nonBlockingGRPCServer struct { - wg sync.WaitGroup - server *grpc.Server - otelTracing bool + wg sync.WaitGroup + server *grpc.Server + otelTracing bool + metricsManager *metrics.MetricsManager } func (s *nonBlockingGRPCServer) Start(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) { @@ -73,10 +75,17 @@ func (s *nonBlockingGRPCServer) ForceStop() { } func (s *nonBlockingGRPCServer) serve(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) { - grpcInterceptor := grpc.UnaryInterceptor(logGRPC) + interceptors := []grpc.UnaryServerInterceptor{logGRPC} + if s.metricsManager != nil { + metricsInterceptor := metrics.MetricInterceptor{ + MetricsManager: s.metricsManager, + } + interceptors = append(interceptors, metricsInterceptor.UnaryInterceptor()) + } if s.otelTracing { - grpcInterceptor = grpc.ChainUnaryInterceptor(logGRPC, otelgrpc.UnaryServerInterceptor()) + interceptors = append(interceptors, otelgrpc.UnaryServerInterceptor()) } + grpcInterceptor := grpc.ChainUnaryInterceptor(interceptors...) opts := []grpc.ServerOption{ grpcInterceptor, diff --git a/pkg/gce-pd-csi-driver/server_test.go b/pkg/gce-pd-csi-driver/server_test.go new file mode 100644 index 000000000..117480574 --- /dev/null +++ b/pkg/gce-pd-csi-driver/server_test.go @@ -0,0 +1,212 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package gceGCEDriver + +import ( + "context" + "fmt" + "os" + "testing" + + csipb "github.com/container-storage-interface/spec/lib/go/csi" + "google.golang.org/grpc" + + csi "github.com/container-storage-interface/spec/lib/go/csi" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" + gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" +) + +func createSocketFile() (string, func(), error) { + tmpDir, err := os.MkdirTemp("", "socket-dir") + if err != nil { + return "", nil, fmt.Errorf("failed to create temporary socket directory: %v", err) + } + cleanup := func() { + os.RemoveAll(tmpDir) + } + socketFile := fmt.Sprintf("%s/test.sock", tmpDir) + return socketFile, cleanup, nil +} + +func createServerClient(mm *metrics.MetricsManager, socketFile string, seedDisks []*gce.CloudDisk) (*grpc.ClientConn, error) { + socketEndpoint := fmt.Sprintf("unix:%s", socketFile) + file, err := os.Create(socketFile) + if err != nil { + return nil, fmt.Errorf("failed to create temporary socket file: %v", err) + } + file.Close() + + metricsPath := "/metrics" + metricEndpoint := "localhost:0" // random port + mm.InitializeHttpHandler(metricEndpoint, metricsPath) + mm.RegisterPDCSIMetric() + + server := NewNonBlockingGRPCServer(false /* enableOtelTracing */, mm) + gceDriver := GetGCEDriver() + identityServer := NewIdentityServer(gceDriver) + fakeCloudProvider, err := gce.CreateFakeCloudProvider(project, zone, seedDisks) + if err != nil { + return nil, fmt.Errorf("failed to create fake cloud provider: %v", err) + } + controllerServer := controllerServerForTest(fakeCloudProvider) + if err := gceDriver.SetupGCEDriver(driver, "test-vendor", nil, nil, identityServer, controllerServer, nil); err != nil { + return nil, fmt.Errorf("failed to setup GCE Driver: %v", err) + } + + if err != nil { + return nil, fmt.Errorf("failed to create volume %v", err) + } + server.Start(socketEndpoint, gceDriver.ids, gceDriver.cs, gceDriver.ns) + + conn, err := grpc.Dial( + socketEndpoint, + grpc.WithInsecure(), + ) + if err != nil { + return nil, fmt.Errorf("failed to create client connection") + } + return conn, nil +} + +func TestServerCreateVolumeMetric(t *testing.T) { + mm := metrics.NewMetricsManager() + mm.ResetMetrics() + socketFile, cleanup, err := createSocketFile() + if err != nil { + t.Fatalf("Failed to create socket file: %v", err) + } + defer cleanup() + conn, err := createServerClient(&mm, socketFile, nil) + if err != nil { + t.Fatalf("Failed to create server client: %v", err) + } + controllerClient := csipb.NewControllerClient(conn) + req := &csi.CreateVolumeRequest{ + Name: name, + CapacityRange: stdCapRange, + VolumeCapabilities: stdVolCaps, + Parameters: map[string]string{ + common.ParameterKeyType: "pd-balanced", + }, + } + resp, err := controllerClient.CreateVolume(context.Background(), req) + + if err != nil { + t.Fatalf("CreateVolume returned unexpected error: %v", err) + } + + wantName := "projects/test-project/zones/country-region-zone/disks/test-name" + if resp.Volume.GetVolumeId() != wantName { + t.Fatalf("Response name expected: %v, got: %v", wantName, resp.Volume.GetVolumeId()) + } + + reg := mm.GetRegistry() + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gailed to gather metrics: %v", err) + } + if len(metrics) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(metrics)) + } + gotMetric := fmt.Sprint(metrics[0]) + wantMetric := `name:"csidriver_operation_errors" help:"[ALPHA] CSI server side error metrics" type:COUNTER metric: label: label: label: label: label: counter: > ` + if gotMetric != wantMetric { + t.Fatalf("Metric mismatch: \ngot: %v\nwant: %v", gotMetric, wantMetric) + } +} + +func TestServerValidateVolumeCapabilitiesMetric(t *testing.T) { + mm := metrics.NewMetricsManager() + mm.ResetMetrics() + seedDisks := []*gce.CloudDisk{ + createZonalCloudDisk(name), + } + socketFile, cleanup, err := createSocketFile() + if err != nil { + t.Fatalf("Failed to create socket file: %v", err) + } + defer cleanup() + conn, err := createServerClient(&mm, socketFile, seedDisks) + if err != nil { + t.Fatalf("Failed to create server client: %v", err) + } + controllerClient := csipb.NewControllerClient(conn) + req := &csi.ValidateVolumeCapabilitiesRequest{ + VolumeId: fmt.Sprintf("projects/%s/zones/%s/disks/%s", project, zone, name), + VolumeCapabilities: stdVolCaps, + } + resp, err := controllerClient.ValidateVolumeCapabilities(context.Background(), req) + + if err != nil { + t.Fatalf("CreateVolume returned unexpected error: %v", err) + } + + if resp.Confirmed != nil { + t.Fatalf("Expected not nil response, got: %v", resp) + } + + reg := mm.GetRegistry() + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gailed to gather metrics: %v", err) + } + if len(metrics) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(metrics)) + } + gotMetric := fmt.Sprint(metrics[0]) + wantMetric := `name:"csidriver_operation_errors" help:"[ALPHA] CSI server side error metrics" type:COUNTER metric: label: label: label: label: label: counter: > ` + if gotMetric != wantMetric { + t.Fatalf("Metric mismatch: \ngot: %v\nwant: %v", gotMetric, wantMetric) + } +} + +func TestServerGetPluginInfoMetric(t *testing.T) { + mm := metrics.NewMetricsManager() + mm.ResetMetrics() + socketFile, cleanup, err := createSocketFile() + if err != nil { + t.Fatalf("Failed to create socket file: %v", err) + } + defer cleanup() + conn, err := createServerClient(&mm, socketFile, nil) + if err != nil { + t.Fatalf("Failed to create server client: %v", err) + } + idClient := csipb.NewIdentityClient(conn) + resp, err := idClient.GetPluginInfo(context.Background(), &csi.GetPluginInfoRequest{}) + if err != nil { + t.Fatalf("GetPluginInfo returned unexpected error: %v", err) + } + + if resp.GetName() != driver { + t.Fatalf("Response name expected: %v, got: %v", driver, resp.GetName()) + } + + reg := mm.GetRegistry() + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gailed to gather metrics: %v", err) + } + if len(metrics) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(metrics)) + } + gotMetric := fmt.Sprint(metrics[0]) + wantMetric := `name:"csidriver_operation_errors" help:"[ALPHA] CSI server side error metrics" type:COUNTER metric: label: label: label: label: label: counter: > ` + if gotMetric != wantMetric { + t.Fatalf("Metric mismatch: \ngot: %v\nwant: %v", gotMetric, wantMetric) + } +} diff --git a/pkg/metrics/interceptor.go b/pkg/metrics/interceptor.go new file mode 100644 index 000000000..57269e713 --- /dev/null +++ b/pkg/metrics/interceptor.go @@ -0,0 +1,23 @@ +package metrics + +import ( + "context" + + "google.golang.org/grpc" +) + +type MetricInterceptor struct { + MetricsManager *MetricsManager +} + +func (m *MetricInterceptor) unaryInterceptorInternal(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) { + requestMetadata := newRequestMetadata() + newCtx := context.WithValue(ctx, requestMetadataKey, requestMetadata) + result, err := handler(newCtx, req) + m.MetricsManager.RecordOperationErrorMetrics(info.FullMethod, err, requestMetadata.diskType, requestMetadata.enableConfidentialStorage, requestMetadata.enableStoragePools) + return result, err +} + +func (m *MetricInterceptor) UnaryInterceptor() grpc.UnaryServerInterceptor { + return m.unaryInterceptorInternal +} diff --git a/pkg/metrics/metadata.go b/pkg/metrics/metadata.go new file mode 100644 index 000000000..a2e41e81a --- /dev/null +++ b/pkg/metrics/metadata.go @@ -0,0 +1,55 @@ +package metrics + +import ( + "context" + "strconv" + + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" + gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" +) + +const ( + // envGKEPDCSIVersion is an environment variable set in the PDCSI controller manifest + // with the current version of the GKE component. + requestMetadataKey = "requestMetadata" +) + +// RequestMetadata represents metadata about a gRPC CSI request +type RequestMetadata struct { + diskType string + enableConfidentialStorage string + enableStoragePools string +} + +func newRequestMetadata() *RequestMetadata { + return &RequestMetadata{ + diskType: DefaultDiskTypeForMetric, + enableConfidentialStorage: DefaultEnableConfidentialCompute, + enableStoragePools: DefaultEnableStoragePools, + } +} + +// MetadataFromContext returns a mutable from a request context +func MetadataFromContext(ctx context.Context) *RequestMetadata { + requestMetadata, _ := ctx.Value(requestMetadataKey).(*RequestMetadata) + return requestMetadata +} + +func UpdateRequestMetadataFromParams(ctx context.Context, params common.DiskParameters) { + metadata := MetadataFromContext(ctx) + if metadata != nil { + metadata.diskType = params.DiskType + metadata.enableConfidentialStorage = strconv.FormatBool(params.EnableConfidentialCompute) + hasStoragePools := len(params.StoragePools) > 0 + metadata.enableStoragePools = strconv.FormatBool(hasStoragePools) + } +} + +func UpdateRequestMetadataFromDisk(ctx context.Context, disk *gce.CloudDisk) { + metadata := MetadataFromContext(ctx) + if metadata != nil && disk != nil { + metadata.diskType = disk.GetPDType() + metadata.enableConfidentialStorage = strconv.FormatBool(disk.GetEnableConfidentialCompute()) + metadata.enableStoragePools = strconv.FormatBool(disk.GetEnableStoragePools()) + } +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 8a998f089..52955fcbb 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -21,13 +21,11 @@ import ( "fmt" "net/http" "os" - "strconv" "google.golang.org/grpc/codes" "k8s.io/component-base/metrics" "k8s.io/klog/v2" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" - gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" ) const ( @@ -41,6 +39,11 @@ const ( ) var ( + gkeComponentVersion *metrics.GaugeVec + pdcsiOperationErrorsMetric *metrics.CounterVec +) + +func initMetrics() { // This metric is exposed only from the controller driver component when GKE_PDCSI_VERSION env variable is set. gkeComponentVersion = metrics.NewGaugeVec(&metrics.GaugeOpts{ Name: "component_version", @@ -55,7 +58,7 @@ var ( StabilityLevel: metrics.ALPHA, }, []string{"driver_name", "method_name", "grpc_status_code", "disk_type", "enable_confidential_storage", "enable_storage_pools"}) -) +} type MetricsManager struct { registry metrics.KubeRegistry @@ -93,13 +96,13 @@ func (mm *MetricsManager) recordComponentVersionMetric() error { } func (mm *MetricsManager) RecordOperationErrorMetrics( - operationName string, + fullMethodName string, operationErr error, diskType string, enableConfidentialStorage string, enableStoragePools string) { errCode := errorCodeLabelValue(operationErr) - pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc() + pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, fullMethodName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc() klog.Infof("Recorded PDCSI operation error code: %q", errCode) } @@ -157,18 +160,6 @@ func IsGKEComponentVersionAvailable() bool { return true } -func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) { - diskType := DefaultDiskTypeForMetric - enableConfidentialStorage := DefaultEnableConfidentialCompute - enableStoragePools := DefaultEnableStoragePools - if disk != nil { - diskType = disk.GetPDType() - enableConfidentialStorage = strconv.FormatBool(disk.GetEnableConfidentialCompute()) - enableStoragePools = strconv.FormatBool(disk.GetEnableStoragePools()) - } - return diskType, enableConfidentialStorage, enableStoragePools -} - // errorCodeLabelValue returns the label value for the given operation error. // This was separated into a helper function for unit testing purposes. func errorCodeLabelValue(operationErr error) string { diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 62499a575..8ef80cac3 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -99,15 +99,18 @@ func TestGetMetricParameters(t *testing.T) { for _, tc := range testCases { t.Logf("Running test: %v", tc.name) - diskType, confidentialCompute, enableStoragePools := GetMetricParameters(tc.disk) - if confidentialCompute != tc.expectedEnableConfidentialCompute { - t.Fatalf("Got confidentialCompute value %q expected %q", confidentialCompute, tc.expectedEnableConfidentialCompute) + ctx := context.TODO() + requestMetadata := newRequestMetadata() + newCtx := context.WithValue(ctx, requestMetadataKey, requestMetadata) + UpdateRequestMetadataFromDisk(newCtx, tc.disk) + if requestMetadata.enableConfidentialStorage != tc.expectedEnableConfidentialCompute { + t.Fatalf("Got confidentialCompute value %q expected %q", requestMetadata.enableConfidentialStorage, tc.expectedEnableConfidentialCompute) } - if diskType != tc.expectedDiskType { - t.Fatalf("Got diskType value %q expected %q", diskType, tc.expectedDiskType) + if requestMetadata.diskType != tc.expectedDiskType { + t.Fatalf("Got diskType value %q expected %q", requestMetadata.enableConfidentialStorage, tc.expectedDiskType) } - if enableStoragePools != tc.expectedEnableStoragePools { - t.Fatalf("Got enableStoragePools value %q expected %q", enableStoragePools, tc.expectedEnableStoragePools) + if requestMetadata.enableStoragePools != tc.expectedEnableStoragePools { + t.Fatalf("Got enableStoragePools value %q expected %q", requestMetadata.enableStoragePools, tc.expectedEnableStoragePools) } } } diff --git a/pkg/metrics/metrics_test_util.go b/pkg/metrics/metrics_test_util.go new file mode 100644 index 000000000..c77142413 --- /dev/null +++ b/pkg/metrics/metrics_test_util.go @@ -0,0 +1,23 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +// Test-only method used for resetting metric counts. +func (mm *MetricsManager) ResetMetrics() { + // Re-initialize metrics + initMetrics() +} diff --git a/test/sanity/sanity_test.go b/test/sanity/sanity_test.go index 8086eb251..c6a2dad22 100644 --- a/test/sanity/sanity_test.go +++ b/test/sanity/sanity_test.go @@ -102,7 +102,7 @@ func TestSanity(t *testing.T) { }() go func() { - gceDriver.Run(endpoint, 10000, false) + gceDriver.Run(endpoint, 10000, false /* enableOtelTracing */, nil /* metricsManager */) }() // TODO(#818): Fix failing tests and remove test skip flag. From 0862fd6a13bb8e6a50cf501940740b1c5e6b76f0 Mon Sep 17 00:00:00 2001 From: Peter Schuurman Date: Fri, 6 Dec 2024 14:38:57 -0800 Subject: [PATCH 10/17] Don't overwrite libc in distroless debian base image --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d439ed960..47f6313dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -84,7 +84,6 @@ COPY --from=debian /lib/${LIB_DIR_PREFIX}-linux-gnu/libselinux.so.1 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/liblzma.so.5 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/libreadline.so.8 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/libz.so.1 \ - /lib/${LIB_DIR_PREFIX}-linux-gnu/libc.so.6 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/liburcu.so.8 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/libcap.so.2 \ /lib/${LIB_DIR_PREFIX}-linux-gnu/libcrypto.so.3 \ From 524895bd38cecc1817e51c09d99b6f613a68f706 Mon Sep 17 00:00:00 2001 From: Saikat Roychowdhury Date: Mon, 9 Dec 2024 19:23:13 +0000 Subject: [PATCH 11/17] update prow rc with 1.15.3-rc1 release candidate --- .../kubernetes/images/prow-stable-sidecar-rc-master/image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml index 55c27b516..cff08f552 100644 --- a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml +++ b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml @@ -48,6 +48,6 @@ metadata: imageTag: name: registry.k8s.io/cloud-provider-gcp/gcp-compute-persistent-disk-csi-driver newName: gcr.io/k8s-staging-cloud-provider-gcp/gcp-compute-persistent-disk-csi-driver - newTag: "v1.15.2-rc3" + newTag: "v1.15.3-rc1" --- From 2e9c71c05904b39d125523611f0aa0545e02e603 Mon Sep 17 00:00:00 2001 From: Saikat Roychowdhury Date: Tue, 10 Dec 2024 22:46:46 +0000 Subject: [PATCH 12/17] Make the volume attribute class file a configurable input The volume attribute class tests are valid for 1.31+ cluster By making the param configurable in the run-k8s-integration-*.sh we can disable the flag in k8s clusters lesser than intended minor version --- test/run-k8s-integration.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/run-k8s-integration.sh b/test/run-k8s-integration.sh index c57ac0dcc..158d1e489 100755 --- a/test/run-k8s-integration.sh +++ b/test/run-k8s-integration.sh @@ -28,6 +28,7 @@ readonly teardown_driver=${GCE_PD_TEARDOWN_DRIVER:-true} readonly gke_node_version=${GKE_NODE_VERSION:-} readonly use_kubetest2=${USE_KUBETEST2:-true} readonly migration_test=${MIGRATION_TEST:-false} +readonly volumeattributesclass_files=${VOLUME_ATTRIBUTES_CLASS_FILES:-hdb-volumeattributesclass.yaml} export GCE_PD_VERBOSITY=9 @@ -48,12 +49,15 @@ base_cmd="${PKGDIR}/bin/k8s-integration-test \ --do-driver-build=${do_driver_build} --teardown-driver=${teardown_driver} \ --do-k8s-build=${do_k8s_build} --boskos-resource-type=${boskos_resource_type} \ --storageclass-files=sc-balanced.yaml --snapshotclass-files=pd-volumesnapshotclass.yaml \ - --volumeattributesclass-files=hdb-volumeattributesclass.yaml \ --storageclass-for-vac-file=sc-hdb.yaml \ --kube-runtime-config=api/all=true \ --deployment-strategy=${deployment_strategy} --test-version=${test_version} \ --num-nodes=3 --image-type=${image_type} --use-kubetest2=${use_kubetest2}" +if [[ -n "${volumeattributesclass_files}" ]]; then + base_cmd+=" --volumeattributesclass-files=${volumeattributesclass_files}" +fi + if [ "$use_gke_managed_driver" = false ]; then base_cmd="${base_cmd} --deploy-overlay-name=${overlay_name}" else From 4f2fd638667ab52b5261ce8c257878a23bbd70e7 Mon Sep 17 00:00:00 2001 From: Alexis MacAskill Date: Thu, 12 Dec 2024 22:37:09 +0000 Subject: [PATCH 13/17] skip xfs test for GCE test skip --- test/k8s-integration/main.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/k8s-integration/main.go b/test/k8s-integration/main.go index 092526a8b..57b25f37e 100644 --- a/test/k8s-integration/main.go +++ b/test/k8s-integration/main.go @@ -635,6 +635,9 @@ func handle() error { func generateGCETestSkip(testParams *testParameters) string { skipString := "\\[Disruptive\\]|\\[Serial\\]" + // Skip mount options test until we fix the invalid mount options for xfs. + skipString = skipString + "|csi-gcepd-sc-xfs.*provisioning.should.provision.storage.with.mount.options" + v := apimachineryversion.MustParseSemantic(testParams.clusterVersion) // "volumeMode should not mount / map unused volumes in a pod" tests a From 96338cae8f2a0f04d68f21af525556fb1a5fcc5e Mon Sep 17 00:00:00 2001 From: Saikat Roychowdhury Date: Thu, 12 Dec 2024 20:04:12 -0800 Subject: [PATCH 14/17] create new rc for 1.15.3 (#1893) Make a new release candidate with 1.15.3 --- .../kubernetes/images/prow-stable-sidecar-rc-master/image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml index cff08f552..1582fcd9e 100644 --- a/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml +++ b/deploy/kubernetes/images/prow-stable-sidecar-rc-master/image.yaml @@ -48,6 +48,6 @@ metadata: imageTag: name: registry.k8s.io/cloud-provider-gcp/gcp-compute-persistent-disk-csi-driver newName: gcr.io/k8s-staging-cloud-provider-gcp/gcp-compute-persistent-disk-csi-driver - newTag: "v1.15.3-rc1" + newTag: "v1.15.3-rc2" --- From 7407af0005f222a7204a9c9cd3775ac0f57b1ad1 Mon Sep 17 00:00:00 2001 From: Baptiste Girard-Carrabin Date: Wed, 18 Dec 2024 14:33:02 +0100 Subject: [PATCH 15/17] [metrics] Fix panic during metrics manager startup This fixes a regression introduced in #1876 where the driver would start panicking on startup if `--http-endpoint` was specified. This was caused by the metrics not being initialized anymore during startup. The proposed fix involves using the `Reset` methods of the metrics object instead of trying to redefine them each time they need to be reset. --- pkg/metrics/metrics.go | 7 +------ pkg/metrics/metrics_test_util.go | 3 ++- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 52955fcbb..600a1b2f0 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -39,11 +39,6 @@ const ( ) var ( - gkeComponentVersion *metrics.GaugeVec - pdcsiOperationErrorsMetric *metrics.CounterVec -) - -func initMetrics() { // This metric is exposed only from the controller driver component when GKE_PDCSI_VERSION env variable is set. gkeComponentVersion = metrics.NewGaugeVec(&metrics.GaugeOpts{ Name: "component_version", @@ -58,7 +53,7 @@ func initMetrics() { StabilityLevel: metrics.ALPHA, }, []string{"driver_name", "method_name", "grpc_status_code", "disk_type", "enable_confidential_storage", "enable_storage_pools"}) -} +) type MetricsManager struct { registry metrics.KubeRegistry diff --git a/pkg/metrics/metrics_test_util.go b/pkg/metrics/metrics_test_util.go index c77142413..ce0d13d95 100644 --- a/pkg/metrics/metrics_test_util.go +++ b/pkg/metrics/metrics_test_util.go @@ -19,5 +19,6 @@ package metrics // Test-only method used for resetting metric counts. func (mm *MetricsManager) ResetMetrics() { // Re-initialize metrics - initMetrics() + gkeComponentVersion.Reset() + pdcsiOperationErrorsMetric.Reset() } From 0f74882156c062bd31503c5f04ea66eec2344fd4 Mon Sep 17 00:00:00 2001 From: Matthew Cary Date: Thu, 2 Jan 2025 10:20:44 -0800 Subject: [PATCH 16/17] Use correct path in error message for udev tooling --- pkg/deviceutils/device-utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/deviceutils/device-utils.go b/pkg/deviceutils/device-utils.go index 75ecabbcb..24a6e8c93 100644 --- a/pkg/deviceutils/device-utils.go +++ b/pkg/deviceutils/device-utils.go @@ -194,7 +194,7 @@ func ensureUdevToolExists(toolPath string) error { if !exists { // The driver should be containerized with the tool so maybe something is // wrong with the build process - return fmt.Errorf("could not find tool at %q, unable to verify device paths", nvmeIdPath) + return fmt.Errorf("could not find tool at %q, unable to verify device paths", toolPath) } return nil } From 37e19fde81601311aea3f9b3f04c10fa85d2352c Mon Sep 17 00:00:00 2001 From: karkunpavan Date: Thu, 9 Jan 2025 13:14:29 +0000 Subject: [PATCH 17/17] adds the changes to support hyperdisk multi-writer mode and updates to e2e tests --- pkg/common/parameters.go | 4 ++++ pkg/gce-cloud-provider/compute/cloud-disk.go | 2 ++ pkg/gce-cloud-provider/compute/gce-compute.go | 20 ------------------- test/e2e/tests/single_zone_e2e_test.go | 11 +++++----- 4 files changed, 11 insertions(+), 26 deletions(-) diff --git a/pkg/common/parameters.go b/pkg/common/parameters.go index 872837193..837b52cee 100644 --- a/pkg/common/parameters.go +++ b/pkg/common/parameters.go @@ -106,6 +106,9 @@ type DiskParameters struct { // Values: {bool} // Default: false MultiZoneProvisioning bool + // Values: READ_WRITE_SINGLE, READ_ONLY_MANY, READ_WRITE_MANY + // Default: READ_WRITE_SINGLE + AccessMode string } // SnapshotParameters contains normalized and defaulted parameters for snapshots @@ -148,6 +151,7 @@ func (pp *ParameterProcessor) ExtractAndDefaultParameters(parameters map[string] Tags: make(map[string]string), // Default Labels: make(map[string]string), // Default ResourceTags: make(map[string]string), // Default + AccessMode: "READ_WRITE_SINGLE", // Default } for k, v := range extraVolumeLabels { diff --git a/pkg/gce-cloud-provider/compute/cloud-disk.go b/pkg/gce-cloud-provider/compute/cloud-disk.go index 6790992c4..71ed6490b 100644 --- a/pkg/gce-cloud-provider/compute/cloud-disk.go +++ b/pkg/gce-cloud-provider/compute/cloud-disk.go @@ -217,6 +217,8 @@ func (d *CloudDisk) GetMultiWriter() bool { switch { case d.disk != nil: return false + case d.disk != nil && d.disk.AccessMode == "READ_WRITE_MANY": + return true case d.betaDisk != nil: return d.betaDisk.MultiWriter default: diff --git a/pkg/gce-cloud-provider/compute/gce-compute.go b/pkg/gce-cloud-provider/compute/gce-compute.go index c2c2b7603..0ec1cc764 100644 --- a/pkg/gce-cloud-provider/compute/gce-compute.go +++ b/pkg/gce-cloud-provider/compute/gce-compute.go @@ -324,8 +324,6 @@ func (cloud *CloudProvider) ListSnapshots(ctx context.Context, filter string) ([ func (cloud *CloudProvider) GetDisk(ctx context.Context, project string, key *meta.Key, gceAPIVersion GCEAPIVersion) (*CloudDisk, error) { klog.V(5).Infof("Getting disk %v", key) - // Override GCEAPIVersion as hyperdisk is only available in beta and we cannot get the disk-type with get disk call. - gceAPIVersion = GCEAPIVersionBeta switch key.Type() { case meta.Zonal: if gceAPIVersion == GCEAPIVersionBeta { @@ -407,11 +405,6 @@ func (cloud *CloudProvider) ValidateExistingDisk(ctx context.Context, resp *Clou reqBytes, common.GbToBytes(resp.GetSizeGb()), limBytes) } - // We are assuming here that a multiWriter disk could be used as non-multiWriter - if multiWriter && !resp.GetMultiWriter() { - return fmt.Errorf("disk already exists with incompatible capability. Need MultiWriter. Got non-MultiWriter") - } - return ValidateDiskParameters(resp, params) } @@ -553,9 +546,6 @@ func convertV1DiskToBetaDisk(v1Disk *computev1.Disk) *computebeta.Disk { AccessMode: v1Disk.AccessMode, } - // Hyperdisk doesn't currently support multiWriter (https://cloud.google.com/compute/docs/disks/hyperdisks#limitations), - // but if multiWriter + hyperdisk is supported in the future, we want the PDCSI driver to support this feature without - // any additional code change. if v1Disk.ProvisionedIops > 0 { betaDisk.ProvisionedIops = v1Disk.ProvisionedIops } @@ -619,9 +609,6 @@ func convertBetaDiskToV1Disk(betaDisk *computebeta.Disk) *computev1.Disk { AccessMode: betaDisk.AccessMode, } - // Hyperdisk doesn't currently support multiWriter (https://cloud.google.com/compute/docs/disks/hyperdisks#limitations), - // but if multiWriter + hyperdisk is supported in the future, we want the PDCSI driver to support this feature without - // any additional code change. if betaDisk.ProvisionedIops > 0 { v1Disk.ProvisionedIops = betaDisk.ProvisionedIops } @@ -651,10 +638,6 @@ func (cloud *CloudProvider) insertRegionalDisk( gceAPIVersion = GCEAPIVersionV1 ) - if multiWriter { - gceAPIVersion = GCEAPIVersionBeta - } - diskToCreate := &computev1.Disk{ Name: volKey.Name, SizeGb: common.BytesToGbRoundUp(capBytes), @@ -778,9 +761,6 @@ func (cloud *CloudProvider) insertZonalDisk( opName string gceAPIVersion = GCEAPIVersionV1 ) - if multiWriter { - gceAPIVersion = GCEAPIVersionBeta - } diskToCreate := &computev1.Disk{ Name: volKey.Name, diff --git a/test/e2e/tests/single_zone_e2e_test.go b/test/e2e/tests/single_zone_e2e_test.go index c77c7b911..1e662773b 100644 --- a/test/e2e/tests/single_zone_e2e_test.go +++ b/test/e2e/tests/single_zone_e2e_test.go @@ -904,8 +904,7 @@ var _ = Describe("GCE PD CSI Driver", func() { Expect(err).To(BeNil(), "Failed to go through volume lifecycle") }) - // Pending while multi-writer feature is in Alpha - PIt("Should create and delete multi-writer disk", func() { + It("Should create and delete multi-writer disk", func() { Expect(testContexts).ToNot(BeEmpty()) testContext := getRandomTestContext() @@ -916,7 +915,7 @@ var _ = Describe("GCE PD CSI Driver", func() { zone := "us-east1-a" // Create and Validate Disk - volName, volID := createAndValidateUniqueZonalMultiWriterDisk(client, p, zone, standardDiskType) + volName, volID := createAndValidateUniqueZonalMultiWriterDisk(client, p, zone, hdbDiskType) defer func() { // Delete Disk @@ -929,8 +928,7 @@ var _ = Describe("GCE PD CSI Driver", func() { }() }) - // Pending while multi-writer feature is in Alpha - PIt("Should complete entire disk lifecycle with multi-writer disk", func() { + It("Should complete entire disk lifecycle with multi-writer disk", func() { testContext := getRandomTestContext() p, z, _ := testContext.Instance.GetIdentity() @@ -938,7 +936,7 @@ var _ = Describe("GCE PD CSI Driver", func() { instance := testContext.Instance // Create and Validate Disk - volName, volID := createAndValidateUniqueZonalMultiWriterDisk(client, p, z, standardDiskType) + volName, volID := createAndValidateUniqueZonalMultiWriterDisk(client, p, z, hdbDiskType) defer func() { // Delete Disk @@ -1710,6 +1708,7 @@ func deleteVolumeOrError(client *remote.CsiClient, volID string) { func createAndValidateUniqueZonalMultiWriterDisk(client *remote.CsiClient, project, zone string, diskType string) (string, string) { // Create Disk disk := typeToDisk[diskType] + disk.params.AccessMode = "READ_WRITE_MANY" volName := testNamePrefix + string(uuid.NewUUID()) volume, err := client.CreateVolumeWithCaps(volName, disk.params, defaultMwSizeGb, &csi.TopologyRequirement{