Skip to content

Commit 8b0a9cf

Browse files
committed
Update gce operation timeout
Update the GCE operation timeout to 2 minutes instead of 5 minutes. 90% of attach/detach disk operations complete in less than 15 seconds, and volume creation completes in less than 1 minute. Reduce the timeout of volume operations so that recovery from a previous operation is quicker in case that operation is dropped.
1 parent 53e8abd commit 8b0a9cf

File tree

2 files changed

+15
-6
lines changed

2 files changed

+15
-6
lines changed

deploy/kubernetes/base/controller/controller.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ spec:
5252
- "--metrics-address=:22012"
5353
- "--leader-election"
5454
- "--leader-election-namespace=$(PDCSI_NAMESPACE)"
55-
- "--timeout=250s"
55+
- "--timeout=60s"
5656
env:
5757
- name: PDCSI_NAMESPACE
5858
valueFrom:

pkg/gce-cloud-provider/compute/gce-compute.go

+14-5
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ const (
4646
GCEAPIVersionV1 GCEAPIVersion = "v1"
4747
// Beta key type
4848
GCEAPIVersionBeta GCEAPIVersion = "beta"
49+
// Timeout for checking on an operation
50+
OpTimeout = 2 * time.Minute
51+
// Time interval for checking on an operation
52+
OpInterval = 3 * time.Second
4953
)
5054

5155
type GCECompute interface {
@@ -611,22 +615,24 @@ func (cloud *CloudProvider) AttachDisk(ctx context.Context, volKey *meta.Key, re
611615
if err != nil {
612616
return fmt.Errorf("failed cloud service attach disk call: %v", err)
613617
}
618+
klog.V(5).Infof("Attaching disk %s operation id is %s", volKey, op.Name)
614619
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
615620
if err != nil {
616-
return fmt.Errorf("failed when waiting for zonal op: %v", err)
621+
return fmt.Errorf("failed when waiting for zonal op %s on attaching volume %v: %v", op.Name, volKey, err)
617622
}
618623
return nil
619624
}
620625

621626
func (cloud *CloudProvider) DetachDisk(ctx context.Context, deviceName, instanceZone, instanceName string) error {
622-
klog.V(5).Infof("Detaching disk %v from %v", deviceName, instanceName)
627+
klog.V(5).Infof("Detaching disk %s from %v", deviceName, instanceName)
623628
op, err := cloud.service.Instances.DetachDisk(cloud.project, instanceZone, instanceName, deviceName).Context(ctx).Do()
624629
if err != nil {
625630
return err
626631
}
632+
klog.V(5).Infof("Detaching disk %s operation id is %s", deviceName, op.Name)
627633
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
628634
if err != nil {
629-
return err
635+
return fmt.Errorf("failed when waiting for zonal op %s on detaching volume %s: %v", op.Name, deviceName, err)
630636
}
631637
return nil
632638
}
@@ -681,7 +687,7 @@ func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, opName string, z
681687
// The v1 API can query for v1, alpha, or beta operations.
682688
svc := cloud.service
683689
project := cloud.project
684-
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
690+
return wait.Poll(OpInterval, OpTimeout, func() (bool, error) {
685691
pollOp, err := svc.ZoneOperations.Get(project, zone, opName).Context(ctx).Do()
686692
if err != nil {
687693
klog.Errorf("WaitForOp(op: %s, zone: %#v) failed to poll the operation", opName, zone)
@@ -743,7 +749,10 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
743749
}
744750

745751
func opIsDone(op *computev1.Operation) (bool, error) {
746-
if op == nil || op.Status != operationStatusDone {
752+
if op == nil {
753+
return true, fmt.Errorf("operation is nil")
754+
}
755+
if op.Status != operationStatusDone {
747756
return false, nil
748757
}
749758
if op.Error != nil && len(op.Error.Errors) > 0 && op.Error.Errors[0] != nil {

0 commit comments

Comments
 (0)