Skip to content

Commit 28d5cc1

Browse files
committed
WIP Update gce operation timeout
Update gce operation timeout to 2 mins instead of 5 mins. The 90% of attach/detach disk is less than 15 second, and create volume is less than 1 min. Reduce the timeout of volume operation so that it can recover from previous operation quicker in case operation is dropped.
1 parent 53e8abd commit 28d5cc1

File tree

2 files changed

+77
-10
lines changed

2 files changed

+77
-10
lines changed

deploy/kubernetes/base/controller/controller.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ spec:
3232
- "--enable-leader-election"
3333
- "--leader-election-type=leases"
3434
- "--leader-election-namespace=$(PDCSI_NAMESPACE)"
35-
- "--timeout=250s"
35+
- "--timeout=180s"
3636
- "--extra-create-metadata"
3737
# - "--run-controller-service=false" # disable the controller service of the CSI driver
3838
# - "--run-node-service=false" # disable the node service of the CSI driver
@@ -52,7 +52,7 @@ spec:
5252
- "--metrics-address=:22012"
5353
- "--leader-election"
5454
- "--leader-election-namespace=$(PDCSI_NAMESPACE)"
55-
- "--timeout=250s"
55+
- "--timeout=90s"
5656
env:
5757
- name: PDCSI_NAMESPACE
5858
valueFrom:

pkg/gce-cloud-provider/compute/gce-compute.go

+75-8
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,28 @@ const (
4646
GCEAPIVersionV1 GCEAPIVersion = "v1"
4747
// Alpha key type
4848
GCEAPIVersionBeta GCEAPIVersion = "beta"
49+
// OpPollInterval is the ineterval between polling an operation
50+
OpPollInterval = 2 * time.Second
51+
// OpTimeout is the default total timeout for polling on an operation
52+
OpTimeout = 2 * time.Minute
53+
// defaultCallTimeout is the ddefault context timeout for creating/getting on an operation
54+
defaultCallTimeout = 3 * time.Second
55+
// Time interval for checking on an operation
56+
AttachDiskInitialDelay = 6 * time.Second
57+
58+
InsertDiskInitialDelay = 3 * time.Second
59+
60+
// volumeAttachmentConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
61+
//volumeAttachmentStatusConsecutiveErrorLimit = 10
62+
63+
// Attach typically takes 2-5 seconds (average is 2). Asking before 2 seconds is just waste of API quota.
64+
volumeAttachmentStatusInitialDelay = 2 * time.Second
65+
// Detach typically takes 5-10 seconds (average is 6). Asking before 5 seconds is just waste of API quota.
66+
volumeDetachmentStatusInitialDelay = 5 * time.Second
67+
// After the initial delay, poll attach/detach with exponential backoff (2046 seconds total)
68+
volumeAttachmentStatusPollDelay = 2 * time.Second
69+
volumeAttachmentStatusFactor = 2
70+
volumeAttachmentStatusSteps = 11
4971
)
5072

5173
type GCECompute interface {
@@ -306,6 +328,11 @@ func ValidateDiskParameters(disk *CloudDisk, params common.DiskParameters) error
306328
return nil
307329
}
308330

331+
// ContextWithCallTimeout returns a context with a default timeout, used for generated client calls.
332+
func ContextWithCallTimeout() (context.Context, context.CancelFunc) {
333+
return context.WithTimeout(context.Background(), defaultCallTimeout)
334+
}
335+
309336
func (cloud *CloudProvider) InsertDisk(ctx context.Context, volKey *meta.Key, params common.DiskParameters, capBytes int64, capacityRange *csi.CapacityRange, replicaZones []string, snapshotID string, multiWriter bool) error {
310337
klog.V(5).Infof("Inserting disk %v", volKey)
311338

@@ -607,26 +634,35 @@ func (cloud *CloudProvider) AttachDisk(ctx context.Context, volKey *meta.Key, re
607634
Type: diskType,
608635
}
609636

610-
op, err := cloud.service.Instances.AttachDisk(cloud.project, instanceZone, instanceName, attachedDiskV1).Context(ctx).Do()
637+
callCtx, cancel := ContextWithCallTimeout()
638+
defer cancel()
639+
640+
op, err := cloud.service.Instances.AttachDisk(cloud.project, instanceZone, instanceName, attachedDiskV1).Context(callCtx).Do()
611641
if err != nil {
612642
return fmt.Errorf("failed cloud service attach disk call: %v", err)
613643
}
644+
klog.V(5).Infof("Attaching disk %s operation id is %s", volKey, op.Name)
614645
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
615646
if err != nil {
616-
return fmt.Errorf("failed when waiting for zonal op: %v", err)
647+
return fmt.Errorf("failed when waiting for zonal op %s on attaching volume %v: %v", op.Name, volKey, err)
617648
}
618649
return nil
619650
}
620651

621652
func (cloud *CloudProvider) DetachDisk(ctx context.Context, deviceName, instanceZone, instanceName string) error {
622-
klog.V(5).Infof("Detaching disk %v from %v", deviceName, instanceName)
623-
op, err := cloud.service.Instances.DetachDisk(cloud.project, instanceZone, instanceName, deviceName).Context(ctx).Do()
653+
654+
callCtx, cancel := ContextWithCallTimeout()
655+
defer cancel()
656+
657+
klog.V(5).Infof("Detaching disk %s from %v", deviceName, instanceName)
658+
op, err := cloud.service.Instances.DetachDisk(cloud.project, instanceZone, instanceName, deviceName).Context(callCtx).Do()
624659
if err != nil {
625660
return err
626661
}
662+
klog.V(5).Infof("Detaching disk %s operation id is %s", deviceName, op.Name)
627663
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
628664
if err != nil {
629-
return err
665+
return fmt.Errorf("failed when waiting for zonal op %s on detaching volume %s: %v", op.Name, deviceName, err)
630666
}
631667
return nil
632668
}
@@ -677,17 +713,45 @@ func (cloud *CloudProvider) getRegionalDiskTypeURI(region, diskType string) stri
677713
return cloud.service.BasePath + fmt.Sprintf(diskTypeURITemplateRegional, cloud.project, region, diskType)
678714
}
679715

716+
// SleepWithContext will wait for the timer duration to expire, or the context
717+
// is canceled. Which ever happens first. If the context is canceled the Context's
718+
// error will be returned.
719+
//
720+
// Expects Context to always return a non-nil error if the Done channel is closed.
721+
/*func SleepWithContext(ctx Context, dur time.Duration) error {
722+
t := time.NewTimer(dur)
723+
defer t.Stop()
724+
725+
select {
726+
case <-t.C:
727+
break
728+
case <-ctx.Done():
729+
return ctx.Err()
730+
}
731+
732+
return nil
733+
}*/
734+
680735
func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, opName string, zone string) error {
681736
// The v1 API can query for v1, alpha, or beta operations.
682737
svc := cloud.service
683738
project := cloud.project
684-
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
685-
pollOp, err := svc.ZoneOperations.Get(project, zone, opName).Context(ctx).Do()
739+
timeout := OpTimeout
740+
if deadline, ok := ctx.Deadline(); ok {
741+
timeout = time.Until(deadline)
742+
klog.Infof("Getting timeout from passed context %v", timeout)
743+
}
744+
callCtx, cancel := ContextWithCallTimeout()
745+
defer cancel()
746+
747+
return wait.Poll(OpPollInterval, timeout, func() (bool, error) {
748+
pollOp, err := svc.ZoneOperations.Get(project, zone, opName).Context(callCtx).Do()
686749
if err != nil {
687750
klog.Errorf("WaitForOp(op: %s, zone: %#v) failed to poll the operation", opName, zone)
688751
return false, err
689752
}
690753
done, err := opIsDone(pollOp)
754+
klog.V(4).Infof("checking op %s is done %t", opName, done)
691755
return done, err
692756
})
693757
}
@@ -743,7 +807,10 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
743807
}
744808

745809
func opIsDone(op *computev1.Operation) (bool, error) {
746-
if op == nil || op.Status != operationStatusDone {
810+
if op == nil {
811+
return true, fmt.Errorf("operation is nil")
812+
}
813+
if op.Status != operationStatusDone {
747814
return false, nil
748815
}
749816
if op.Error != nil && len(op.Error.Errors) > 0 && op.Error.Errors[0] != nil {

0 commit comments

Comments
 (0)