
Commit 662d6d5

Merge pull request kubernetes-sigs#34 from jsafrane/4.11-throttling
OCPBUGS-11000: UPSTREAM: 988: Simplify node backoff logic
2 parents f9d7fdc + e04a8e7 commit 662d6d5

File tree: 6 files changed (+396, −272 lines)


pkg/gce-cloud-provider/compute/fake-gce.go

+28 −3
@@ -504,27 +504,47 @@ func (cloud *FakeCloudProvider) UpdateDiskStatus(s string) {
 
 type FakeBlockingCloudProvider struct {
 	*FakeCloudProvider
-	ReadyToExecute chan chan struct{}
+	ReadyToExecute chan chan Signal
 }
 
 // FakeBlockingCloudProvider's method adds functionality to finely control the order of execution of CreateSnapshot calls.
 // Upon starting a CreateSnapshot, it passes a chan 'executeCreateSnapshot' into readyToExecute, then blocks on executeCreateSnapshot.
 // The test calling this function can block on readyToExecute to ensure that the operation has started and
 // allowed the CreateSnapshot to continue by passing a struct into executeCreateSnapshot.
 func (cloud *FakeBlockingCloudProvider) CreateSnapshot(ctx context.Context, project string, volKey *meta.Key, snapshotName string, snapshotParams common.SnapshotParameters) (*computev1.Snapshot, error) {
-	executeCreateSnapshot := make(chan struct{})
+	executeCreateSnapshot := make(chan Signal)
 	cloud.ReadyToExecute <- executeCreateSnapshot
 	<-executeCreateSnapshot
 	return cloud.FakeCloudProvider.CreateSnapshot(ctx, project, volKey, snapshotName, snapshotParams)
 }
 
 func (cloud *FakeBlockingCloudProvider) CreateImage(ctx context.Context, project string, volKey *meta.Key, imageName string, snapshotParams common.SnapshotParameters) (*computev1.Image, error) {
-	executeCreateSnapshot := make(chan struct{})
+	executeCreateSnapshot := make(chan Signal)
 	cloud.ReadyToExecute <- executeCreateSnapshot
 	<-executeCreateSnapshot
 	return cloud.FakeCloudProvider.CreateImage(ctx, project, volKey, imageName, snapshotParams)
 }
 
+func (cloud *FakeBlockingCloudProvider) DetachDisk(ctx context.Context, project, deviceName, instanceZone, instanceName string) error {
+	execute := make(chan Signal)
+	cloud.ReadyToExecute <- execute
+	val := <-execute
+	if val.ReportError {
+		return fmt.Errorf("force mock error for DetachDisk device %s", deviceName)
+	}
+	return cloud.FakeCloudProvider.DetachDisk(ctx, project, deviceName, instanceZone, instanceName)
+}
+
+func (cloud *FakeBlockingCloudProvider) AttachDisk(ctx context.Context, project string, volKey *meta.Key, readWrite, diskType, instanceZone, instanceName string) error {
+	execute := make(chan Signal)
+	cloud.ReadyToExecute <- execute
+	val := <-execute
+	if val.ReportError {
+		return fmt.Errorf("force mock error for AttachDisk: volkey %s", volKey)
+	}
+	return cloud.FakeCloudProvider.AttachDisk(ctx, project, volKey, readWrite, diskType, instanceZone, instanceName)
+}
+
 func notFoundError() *googleapi.Error {
 	return &googleapi.Error{
 		Errors: []googleapi.ErrorItem{
@@ -544,3 +564,8 @@ func invalidError() *googleapi.Error {
 		},
 	}
 }
+
+type Signal struct {
+	ReportError   bool
+	ReportRunning bool
+}
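
Note on the hunk above: switching ReadyToExecute from chan chan struct{} to chan chan Signal lets a test not only sequence the fake's Attach/Detach calls but also script their outcome. Below is a self-contained sketch of that handshake; it assumes nothing from this repository beyond the Signal shape shown in the diff, and blockingDetach is a stand-in name for the fake's DetachDisk, not an identifier from the code.

// Self-contained sketch of the ReadyToExecute / Signal handshake used by
// FakeBlockingCloudProvider above. blockingDetach is a stand-in for the
// fake's DetachDisk; only the channel protocol is reproduced here.
package main

import "fmt"

type Signal struct {
	ReportError   bool
	ReportRunning bool
}

// blockingDetach announces that a call is in flight, then blocks until the
// test decides its outcome via the Signal it sends back.
func blockingDetach(ready chan chan Signal) error {
	execute := make(chan Signal)
	ready <- execute
	val := <-execute
	if val.ReportError {
		return fmt.Errorf("forced mock error")
	}
	return nil
}

func main() {
	ready := make(chan chan Signal, 1)

	// Test side: fail the first in-flight call, let the second succeed.
	go func() {
		(<-ready) <- Signal{ReportError: true}
		(<-ready) <- Signal{}
	}()

	fmt.Println(blockingDetach(ready)) // forced mock error
	fmt.Println(blockingDetach(ready)) // <nil>
}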

pkg/gce-pd-csi-driver/controller.go

+89 −93
@@ -31,14 +31,18 @@ import (
 	"google.golang.org/grpc/status"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/uuid"
-	"k8s.io/apimachinery/pkg/util/wait"
-	"k8s.io/client-go/util/workqueue"
+	"k8s.io/client-go/util/flowcontrol"
 	"k8s.io/klog"
 
 	"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
 	gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
 )
 
+const (
+	errorBackoffInitialDuration = 200 * time.Millisecond
+	errorBackoffMaxDuration     = 5 * time.Minute
+)
+
 type GCEControllerServer struct {
 	Driver        *GCEDriver
 	CloudProvider gce.GCECompute
@@ -54,16 +58,45 @@ type GCEControllerServer struct {
 	// Aborted error
 	volumeLocks *common.VolumeLocks
 
-	// queue is a rate limited work queue for Controller Publish/Unpublish
-	// Volume calls
-	queue workqueue.RateLimitingInterface
+	// There are several kinds of errors that are immediately retried by either
+	// the CSI sidecars or the k8s control plane. The retries consume GCP API
+	// quota, e.g. by doing ListVolumes, and so backoff needs to be used to
+	// prevent quota exhaustion.
+	//
+	// Examples of these errors are the per-instance GCE operation queue getting
+	// full (typically only 32 operations in flight at a time are allowed), and
+	// disks being deleted out from under a PV causing unpublish errors.
+	//
+	// While we need to backoff, we also need some semblance of fairness. In
+	// particular, volume unpublish retries happen very quickly, and with
+	// a single backoff per node these retries can prevent any other operation
+	// from making progress, even if it would succeed. Hence we track errors on
+	// node and disk pairs, backing off only for calls matching such a
+	// pair.
+	//
+	// An implication is that in the full operation queue situation, requests
+	// for new disks will not backoff the first time. This is acceptable as a
+	// single spurious call will not cause problems for quota exhaustion or make
+	// the operation queue problem worse. This is well compensated by giving
+	// disks where no problems are occurring a chance to be processed.
+	//
+	// errorBackoff keeps track of any active backoff condition on a given node,
+	// and the time when retry of controller publish/unpublish is permissible. A
+	// node and disk pair is marked with backoff when any error is encountered
+	// by the driver during controller publish/unpublish calls. If the
+	// controller eventually allows controller publish/unpublish requests for
+	// volumes (because the backoff time expired), and those requests fail, the
+	// next backoff retry time will be updated on every failure and capped at
+	// 'errorBackoffMaxDuration'. Also, any successful controller
+	// publish/unpublish call will clear the backoff condition for a node and
+	// disk.
+	errorBackoff *csiErrorBackoff
+}
 
-	// publishErrorsSeenOnNode is a list of nodes with attach/detach
-	// operation failures so those nodes shall be rate limited for all
-	// the attach/detach operations until there is an attach / detach
-	// operation succeeds
-	publishErrorsSeenOnNode map[string]bool
+type csiErrorBackoff struct {
+	backoff *flowcontrol.Backoff
 }
+type csiErrorBackoffId string
 
 type workItem struct {
 	ctx context.Context
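
The comment block introduced above argues for keying backoff by node and volume rather than by node alone. A minimal sketch of that distinction follows, using k8s.io/client-go/util/flowcontrol directly; the "node:volume" key format mirrors the backoffId helper added at the end of this file, the durations are the same constants as above, and the node/volume names are illustrative.

// Sketch: backoff keyed by node:volume pair, using the same constants as the
// const block added above. A failure for vol-a on node-1 throttles only that
// pair; vol-b on the same node is unaffected.
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	backoff := flowcontrol.NewBackOff(200*time.Millisecond, 5*time.Minute)

	// An unpublish error for node-1:vol-a starts a backoff for that pair only.
	backoff.Next("node-1:vol-a", backoff.Clock.Now())

	fmt.Println(backoff.IsInBackOffSinceUpdate("node-1:vol-a", backoff.Clock.Now())) // true: this pair must wait
	fmt.Println(backoff.IsInBackOffSinceUpdate("node-1:vol-b", backoff.Clock.Now())) // false: other volumes proceed
}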
@@ -336,73 +369,27 @@ func (gceCS *GCEControllerServer) DeleteVolume(ctx context.Context, req *csi.Del
 	return &csi.DeleteVolumeResponse{}, nil
 }
 
-// Run starts the GCEControllerServer.
-func (gceCS *GCEControllerServer) Run() {
-	go wait.Until(gceCS.worker, 1*time.Second, wait.NeverStop)
-}
-
-func (gceCS *GCEControllerServer) worker() {
-	// Runs until workqueue is shut down
-	for gceCS.processNextWorkItem() {
-	}
-}
-
-func (gceCS *GCEControllerServer) processNextWorkItem() bool {
-	item, quit := gceCS.queue.Get()
-	if quit {
-		return false
-	}
-	defer gceCS.queue.Done(item)
-
-	workItem, ok := item.(*workItem)
-	if !ok {
-		gceCS.queue.AddRateLimited(item)
-		return true
-	}
-
-	if workItem.publishReq != nil {
-		_, err := gceCS.executeControllerPublishVolume(workItem.ctx, workItem.publishReq)
-
-		if err != nil {
-			klog.Errorf("ControllerPublishVolume failed with error: %v", err)
-		}
-	}
-
-	if workItem.unpublishReq != nil {
-		_, err := gceCS.executeControllerUnpublishVolume(workItem.ctx, workItem.unpublishReq)
-
-		if err != nil {
-			klog.Errorf("ControllerUnpublishVolume failed with error: %v", err)
-		}
-	}
-
-	gceCS.queue.Forget(item)
-	return true
-}
-
 func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error) {
-	// Only valid requests will be queued
+	// Only valid requests will be accepted
 	_, _, err := gceCS.validateControllerPublishVolumeRequest(ctx, req)
-
 	if err != nil {
 		return nil, err
 	}
 
-	// If the node is not marked, proceed the request
-	if _, found := gceCS.publishErrorsSeenOnNode[req.NodeId]; !found {
-		return gceCS.executeControllerPublishVolume(ctx, req)
+	backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
+	if gceCS.errorBackoff.blocking(backoffId) {
+		return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
 	}
 
-	// Node is marked so queue up the request. Note the original gRPC context may get canceled,
-	// so a new one is created here.
-	//
-	// Note that the original context probably has a timeout (see csiAttach in external-attacher),
-	// which is ignored.
-	gceCS.queue.AddRateLimited(&workItem{
-		ctx:        context.Background(),
-		publishReq: req,
-	})
-	return nil, status.Error(codes.Unavailable, "Request queued due to error condition on node")
+	resp, err := gceCS.executeControllerPublishVolume(ctx, req)
+	if err != nil {
+		klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
+		gceCS.errorBackoff.next(backoffId)
+	} else {
+		klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId)
+		gceCS.errorBackoff.reset(backoffId)
+	}
+	return resp, err
 }
 
 func (gceCS *GCEControllerServer) validateControllerPublishVolumeRequest(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (string, *meta.Key, error) {
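
ControllerPublishVolume above and ControllerUnpublishVolume in the next hunk now share the same shape: reject with codes.Unavailable while the node:volume pair is in backoff, otherwise execute and record the outcome. Below is a condensed, runnable rendering of that control flow assuming only the flowcontrol package; gate and doOperation are illustrative names, not identifiers from this repository, and the real driver returns a gRPC status error where the sketch returns a plain error.

// Sketch of the shared publish/unpublish control flow: refuse the call while
// the node:volume pair is backing off, otherwise run it and update the backoff
// state from the result.
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func gate(b *flowcontrol.Backoff, nodeID, volumeID string, doOperation func() error) error {
	id := nodeID + ":" + volumeID
	if b.IsInBackOffSinceUpdate(id, b.Clock.Now()) {
		// The real driver returns status.Errorf(codes.Unavailable, ...) here.
		return fmt.Errorf("not permitted on node %q due to backoff condition", nodeID)
	}
	err := doOperation()
	if err != nil {
		b.Next(id, b.Clock.Now()) // start or extend the backoff window
	} else {
		b.Reset(id) // success clears the condition for this pair
	}
	return err
}

func main() {
	b := flowcontrol.NewBackOff(200*time.Millisecond, 5*time.Minute)
	fail := func() error { return errors.New("operation queue full") }
	ok := func() error { return nil }

	fmt.Println(gate(b, "node-1", "vol-a", fail)) // operation runs, fails, backoff starts
	fmt.Println(gate(b, "node-1", "vol-a", ok))   // rejected: still inside the 200ms window
	time.Sleep(250 * time.Millisecond)
	fmt.Println(gate(b, "node-1", "vol-a", ok))   // window expired; runs, succeeds, resets
}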
@@ -514,39 +501,33 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
 
 	err = gceCS.CloudProvider.WaitForAttach(ctx, project, volKey, instanceZone, instanceName)
 	if err != nil {
-		// Mark the node and rate limit all the following attach/detach
-		// operations for this node
-		gceCS.publishErrorsSeenOnNode[nodeID] = true
 		return nil, status.Error(codes.Internal, fmt.Sprintf("unknown WaitForAttach error: %v", err))
 	}
-
-	// Attach succeeds so unmark the node
-	delete(gceCS.publishErrorsSeenOnNode, nodeID)
-
 	klog.V(4).Infof("ControllerPublishVolume succeeded for disk %v to instance %v", volKey, nodeID)
 	return pubVolResp, nil
 }
 
 func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error) {
 	// Only valid requests will be queued
 	_, _, err := gceCS.validateControllerUnpublishVolumeRequest(ctx, req)
-
 	if err != nil {
 		return nil, err
 	}
 
-	// If the node is not marked, proceed the request
-	if _, found := gceCS.publishErrorsSeenOnNode[req.NodeId]; !found {
-		return gceCS.executeControllerUnpublishVolume(ctx, req)
+	backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
+	if gceCS.errorBackoff.blocking(backoffId) {
+		return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
 	}
 
-	// Node is marked so queue up the request
-	gceCS.queue.AddRateLimited(&workItem{
-		ctx:          context.Background(),
-		unpublishReq: req,
-	})
-
-	return nil, status.Error(codes.Unavailable, "Request queued due to error condition on node")
+	resp, err := gceCS.executeControllerUnpublishVolume(ctx, req)
+	if err != nil {
+		klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId)
+		gceCS.errorBackoff.next(backoffId)
+	} else {
+		klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId)
+		gceCS.errorBackoff.reset(backoffId)
+	}
+	return resp, err
 }
 
 func (gceCS *GCEControllerServer) validateControllerUnpublishVolumeRequest(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (string, *meta.Key, error) {
@@ -622,15 +603,9 @@ func (gceCS *GCEControllerServer) executeControllerUnpublishVolume(ctx context.C
 
 	err = gceCS.CloudProvider.DetachDisk(ctx, project, deviceName, instanceZone, instanceName)
 	if err != nil {
-		// Mark the node and rate limit all the following attach/detach
-		// operations for this node
-		gceCS.publishErrorsSeenOnNode[nodeID] = true
 		return nil, status.Error(codes.Internal, fmt.Sprintf("unknown detach error: %v", err))
 	}
 
-	// Detach succeeds so unmark the node
-	delete(gceCS.publishErrorsSeenOnNode, nodeID)
-
 	klog.V(4).Infof("ControllerUnpublishVolume succeeded for disk %v from node %v", volKey, nodeID)
 	return &csi.ControllerUnpublishVolumeResponse{}, nil
 }
@@ -1587,3 +1562,24 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) {
 	}
 	return ret, nil
 }
+
+func newCsiErrorBackoff() *csiErrorBackoff {
+	return &csiErrorBackoff{flowcontrol.NewBackOff(errorBackoffInitialDuration, errorBackoffMaxDuration)}
+}
+
+func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId {
+	return csiErrorBackoffId(fmt.Sprintf("%s:%s", nodeId, volumeId))
+}
+
+func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool {
+	blk := b.backoff.IsInBackOffSinceUpdate(string(id), b.backoff.Clock.Now())
+	return blk
+}
+
+func (b *csiErrorBackoff) next(id csiErrorBackoffId) {
+	b.backoff.Next(string(id), b.backoff.Clock.Now())
+}
+
+func (b *csiErrorBackoff) reset(id csiErrorBackoffId) {
+	b.backoff.Reset(string(id))
+}
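
For reference, the helpers above are thin wrappers over flowcontrol.Backoff: Next roughly doubles the per-key delay on each call (starting from errorBackoffInitialDuration) and caps it at errorBackoffMaxDuration, while Reset clears the key. A small standalone sketch of that growth follows; Get is the library's accessor for the current per-key delay, and the loop count is arbitrary.

// Sketch of how the per-key delay grows under repeated Next calls and is
// cleared by Reset; same parameters as newCsiErrorBackoff above.
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	b := flowcontrol.NewBackOff(200*time.Millisecond, 5*time.Minute)
	id := "node-1:vol-a"

	for i := 0; i < 14; i++ {
		b.Next(id, b.Clock.Now())
		fmt.Printf("failure %2d: next retry delay %v\n", i+1, b.Get(id))
	}
	// Prints 200ms, 400ms, 800ms, ... doubling until capped at 5m0s.

	b.Reset(id)
	fmt.Println("after reset:", b.Get(id)) // 0s: the pair is no longer throttled
}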
