From e31328981f85fbd996793d5bbfc075eea44796ea Mon Sep 17 00:00:00 2001 From: Julianne DeMars Date: Mon, 21 Nov 2022 22:32:46 +0000 Subject: [PATCH 1/2] add rpc-specific backoff --- pkg/gce-pd-csi-driver/controller.go | 14 ++++++++----- pkg/gce-pd-csi-driver/controller_test.go | 26 +++++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go index 0c68d4107..59d9491a1 100644 --- a/pkg/gce-pd-csi-driver/controller.go +++ b/pkg/gce-pd-csi-driver/controller.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "math/rand" + "reflect" "regexp" "sort" "time" @@ -410,7 +411,9 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r return nil, err } - backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId) + backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId, reflect.TypeOf(req).String()) + klog.Errorf("BLAHBLAH %s", reflect.TypeOf(req).Name()) + if gceCS.errorBackoff.blocking(backoffId) { return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId) } @@ -517,7 +520,7 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con attached, err := diskIsAttachedAndCompatible(deviceName, instance, volumeCapability, readWrite) if err != nil { - return nil, status.Error(codes.AlreadyExists, fmt.Sprintf("Disk %v already published to node %v but incompatbile: %v", volKey.Name, nodeID, err)) + return nil, status.Error(codes.AlreadyExists, fmt.Sprintf("Disk %v already published to node %v but incompatible: %v", volKey.Name, nodeID, err)) } if attached { // Volume is attached to node. Success! @@ -548,7 +551,7 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context, return nil, err } - backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId) + backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId, reflect.TypeOf(req).Name()) if gceCS.errorBackoff.blocking(backoffId) { return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId) } @@ -1600,8 +1603,8 @@ func newCsiErrorBackoff() *csiErrorBackoff { return &csiErrorBackoff{flowcontrol.NewBackOff(errorBackoffInitialDuration, errorBackoffMaxDuration)} } -func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId { - return csiErrorBackoffId(fmt.Sprintf("%s:%s", nodeId, volumeId)) +func (_ *csiErrorBackoff) backoffId(nodeId, volumeId, reqType string) csiErrorBackoffId { + return csiErrorBackoffId(fmt.Sprintf("%s:%s:%s", nodeId, volumeId, reqType)) } func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool { @@ -1610,6 +1613,7 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool { } func (b *csiErrorBackoff) next(id csiErrorBackoffId) { + klog.Errorf("Here %s", id) b.backoff.Next(string(id), b.backoff.Clock.Now()) } diff --git a/pkg/gce-pd-csi-driver/controller_test.go b/pkg/gce-pd-csi-driver/controller_test.go index f7f68b268..f52fce5bf 100644 --- a/pkg/gce-pd-csi-driver/controller_test.go +++ b/pkg/gce-pd-csi-driver/controller_test.go @@ -2195,7 +2195,7 @@ func backoffTesterForUnpublish(t *testing.T, config *backoffTesterConfig) { errorBackoff: newFakeCsiErrorBackoff(tc), } - backoffId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID) + backoffId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID, reflect.TypeOf(csi.ControllerUnpublishVolumeRequest{}).String()) step := 1 * time.Millisecond runUnpublishRequest := func(req *csi.ControllerUnpublishVolumeRequest, reportError bool) error { @@ -2390,10 +2390,14 @@ func backoffTesterForPublish(t *testing.T, config *backoffTesterConfig) { errorBackoff: newFakeCsiErrorBackoff(tc), } - backoffId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID) + backoffUnpublishId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID, reflect.TypeOf(csi.ControllerUnpublishVolumeRequest{}).String()) step := 1 * time.Millisecond + // Mock an active backoff condition for unpublish command on the node. + driver.cs.errorBackoff.next(backoffUnpublishId) + + backoffPublishId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID, reflect.TypeOf(csi.ControllerPublishVolumeRequest{}).String()) // Mock an active backoff condition on the node. - driver.cs.errorBackoff.next(backoffId) + driver.cs.errorBackoff.next(backoffPublishId) // A detach request for a different disk should succeed. As this disk is not // on the instance, the detach will succeed without calling the gce detach @@ -2447,9 +2451,13 @@ func backoffTesterForPublish(t *testing.T, config *backoffTesterConfig) { t.Errorf("unexpected error %v", err) } - t1 := driver.cs.errorBackoff.backoff.Get(string(backoffId)) + t1 := driver.cs.errorBackoff.backoff.Get(string(backoffUnpublishId)) + if t1 == 0 { + t.Error("expected delay for unpublish backoff, got none") + } + t1 = driver.cs.errorBackoff.backoff.Get(string(backoffPublishId)) if t1 == 0 { - t.Error("expected delay, got none") + t.Error("expected delay for publish backoff, got none") } return } @@ -2478,9 +2486,13 @@ func backoffTesterForPublish(t *testing.T, config *backoffTesterConfig) { } // Driver is expected to remove the node key from the backoff map. - t1 := driver.cs.errorBackoff.backoff.Get(string(backoffId)) + t1 := driver.cs.errorBackoff.backoff.Get(string(backoffUnpublishId)) if t1 != 0 { - t.Error("unexpected delay") + t.Error("unexpected unpublish backoff delay") + } + t1 = driver.cs.errorBackoff.backoff.Get(string(backoffPublishId)) + if t1 != 0 { + t.Error("unexpected publish backoff delay") } } From 097416afc2415d268a17b34b0ccc7835c0775ee7 Mon Sep 17 00:00:00 2001 From: Julianne DeMars Date: Mon, 21 Nov 2022 23:02:11 +0000 Subject: [PATCH 2/2] rm debug lines --- pkg/gce-pd-csi-driver/controller.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go index 59d9491a1..ed5bb4d78 100644 --- a/pkg/gce-pd-csi-driver/controller.go +++ b/pkg/gce-pd-csi-driver/controller.go @@ -412,8 +412,6 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r } backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId, reflect.TypeOf(req).String()) - klog.Errorf("BLAHBLAH %s", reflect.TypeOf(req).Name()) - if gceCS.errorBackoff.blocking(backoffId) { return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId) } @@ -1613,7 +1611,6 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool { } func (b *csiErrorBackoff) next(id csiErrorBackoffId) { - klog.Errorf("Here %s", id) b.backoff.Next(string(id), b.backoff.Clock.Now()) }