@@ -46,6 +46,28 @@ const (
46
46
GCEAPIVersionV1 GCEAPIVersion = "v1"
47
47
// Alpha key type
48
48
GCEAPIVersionBeta GCEAPIVersion = "beta"
49
+ // OpPollInterval is the ineterval between polling an operation
50
+ OpPollInterval = 1 * time .Second
51
+ // OpTimeout is the default total timeout for polling on an operation
52
+ OpTimeout = 2 * time .Minute
53
+ // defaultCallTimeout is the ddefault context timeout for creating/getting on an operation
54
+ defaultCallTimeout = 3 * time .Second
55
+ // Time interval for checking on an operation
56
+ AttachDiskInitialDelay = 6 * time .Second
57
+
58
+ InsertDiskInitialDelay = 3 * time .Second
59
+
60
+ // volumeAttachmentConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
61
+ //volumeAttachmentStatusConsecutiveErrorLimit = 10
62
+
63
+ // Attach typically takes 2-5 seconds (average is 2). Asking before 2 seconds is just waste of API quota.
64
+ volumeAttachmentStatusInitialDelay = 2 * time .Second
65
+ // Detach typically takes 5-10 seconds (average is 6). Asking before 5 seconds is just waste of API quota.
66
+ volumeDetachmentStatusInitialDelay = 5 * time .Second
67
+ // After the initial delay, poll attach/detach with exponential backoff (2046 seconds total)
68
+ volumeAttachmentStatusPollDelay = 2 * time .Second
69
+ volumeAttachmentStatusFactor = 2
70
+ volumeAttachmentStatusSteps = 11
49
71
)
50
72
51
73
type GCECompute interface {
@@ -306,6 +328,11 @@ func ValidateDiskParameters(disk *CloudDisk, params common.DiskParameters) error
306
328
return nil
307
329
}
308
330
331
+ // ContextWithCallTimeout returns a context with a default timeout, used for generated client calls.
332
+ func ContextWithCallTimeout () (context.Context , context.CancelFunc ) {
333
+ return context .WithTimeout (context .Background (), defaultCallTimeout )
334
+ }
335
+
309
336
func (cloud * CloudProvider ) InsertDisk (ctx context.Context , volKey * meta.Key , params common.DiskParameters , capBytes int64 , capacityRange * csi.CapacityRange , replicaZones []string , snapshotID string , multiWriter bool ) error {
310
337
klog .V (5 ).Infof ("Inserting disk %v" , volKey )
311
338
@@ -489,17 +516,20 @@ func (cloud *CloudProvider) insertZonalDisk(
489
516
}
490
517
}
491
518
519
+ callCtx , cancel := ContextWithCallTimeout ()
520
+ defer cancel ()
521
+
492
522
if gceAPIVersion == GCEAPIVersionBeta {
493
523
var insertOp * computebeta.Operation
494
524
betaDiskToCreate := convertV1DiskToBetaDisk (diskToCreate )
495
525
betaDiskToCreate .MultiWriter = multiWriter
496
- insertOp , err = cloud .betaService .Disks .Insert (cloud .project , volKey .Zone , betaDiskToCreate ).Context (ctx ).Do ()
526
+ insertOp , err = cloud .betaService .Disks .Insert (cloud .project , volKey .Zone , betaDiskToCreate ).Context (callCtx ).Do ()
497
527
if insertOp != nil {
498
528
opName = insertOp .Name
499
529
}
500
530
} else {
501
531
var insertOp * computev1.Operation
502
- insertOp , err = cloud .service .Disks .Insert (cloud .project , volKey .Zone , diskToCreate ).Context (ctx ).Do ()
532
+ insertOp , err = cloud .service .Disks .Insert (cloud .project , volKey .Zone , diskToCreate ).Context (callCtx ).Do ()
503
533
if insertOp != nil {
504
534
opName = insertOp .Name
505
535
}
@@ -523,7 +553,7 @@ func (cloud *CloudProvider) insertZonalDisk(
523
553
}
524
554
return fmt .Errorf ("unknown Insert disk error: %v" , err )
525
555
}
526
-
556
+ klog . Infof ( "Insert disk %s op id %s" , volKey . Name , opName )
527
557
err = cloud .waitForZonalOp (ctx , opName , volKey .Zone )
528
558
529
559
if err != nil {
@@ -568,6 +598,7 @@ func (cloud *CloudProvider) deleteZonalDisk(ctx context.Context, zone, name stri
568
598
}
569
599
return err
570
600
}
601
+ klog .Infof ("delete disk %s op id %s" , name , op .Name )
571
602
err = cloud .waitForZonalOp (ctx , op .Name , zone )
572
603
if err != nil {
573
604
return err
@@ -607,26 +638,47 @@ func (cloud *CloudProvider) AttachDisk(ctx context.Context, volKey *meta.Key, re
607
638
Type : diskType ,
608
639
}
609
640
610
- op , err := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (ctx ).Do ()
641
+ callCtx , cancel := ContextWithCallTimeout ()
642
+ defer cancel ()
643
+
644
+ op , err := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (callCtx ).Do ()
611
645
if err != nil {
612
- return fmt .Errorf ("failed cloud service attach disk call: %v" , err )
646
+ return fmt .Errorf ("failed cloud service attach disk %v call: %v" , volKey , err )
613
647
}
648
+ klog .V (5 ).Infof ("Attaching disk %s operation id is %s" , volKey , op .Name )
649
+ time .Sleep (2 * time .Second )
650
+
651
+ op2 , err2 := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (callCtx ).Do ()
652
+ if err2 != nil {
653
+ klog .Errorf ("failed cloud service attach disk %v call op2 %v: %v" , volKey , op2 , err2 )
654
+ //return fmt.Errorf("failed cloud service attach disk %v call op2 %v: %v", volKey, op2, err2)
655
+ } else {
656
+ klog .V (2 ).Infof ("Attaching disk %s operation2 id is %s" , volKey , op2 .Name )
657
+ }
658
+
659
+ klog .V (5 ).Infof ("Attaching disk %s operation id is %s" , volKey , op .Name )
660
+
614
661
err = cloud .waitForZonalOp (ctx , op .Name , instanceZone )
615
662
if err != nil {
616
- return fmt .Errorf ("failed when waiting for zonal op: %v" , err )
663
+ return fmt .Errorf ("failed when waiting for zonal op %s on attaching volume %v : %v" , op . Name , volKey , err )
617
664
}
618
665
return nil
619
666
}
620
667
621
668
func (cloud * CloudProvider ) DetachDisk (ctx context.Context , deviceName , instanceZone , instanceName string ) error {
622
- klog .V (5 ).Infof ("Detaching disk %v from %v" , deviceName , instanceName )
623
- op , err := cloud .service .Instances .DetachDisk (cloud .project , instanceZone , instanceName , deviceName ).Context (ctx ).Do ()
669
+
670
+ callCtx , cancel := ContextWithCallTimeout ()
671
+ defer cancel ()
672
+
673
+ klog .V (5 ).Infof ("Detaching disk %s from %v" , deviceName , instanceName )
674
+ op , err := cloud .service .Instances .DetachDisk (cloud .project , instanceZone , instanceName , deviceName ).Context (callCtx ).Do ()
624
675
if err != nil {
625
676
return err
626
677
}
678
+ klog .V (5 ).Infof ("Detaching disk %s operation id is %s" , deviceName , op .Name )
627
679
err = cloud .waitForZonalOp (ctx , op .Name , instanceZone )
628
680
if err != nil {
629
- return err
681
+ return fmt . Errorf ( "failed when waiting for zonal op %s on detaching volume %s: %v" , op . Name , deviceName , err )
630
682
}
631
683
return nil
632
684
}
@@ -677,17 +729,45 @@ func (cloud *CloudProvider) getRegionalDiskTypeURI(region, diskType string) stri
677
729
return cloud .service .BasePath + fmt .Sprintf (diskTypeURITemplateRegional , cloud .project , region , diskType )
678
730
}
679
731
732
+ // SleepWithContext will wait for the timer duration to expire, or the context
733
+ // is canceled. Which ever happens first. If the context is canceled the Context's
734
+ // error will be returned.
735
+ //
736
+ // Expects Context to always return a non-nil error if the Done channel is closed.
737
+ /*func SleepWithContext(ctx Context, dur time.Duration) error {
738
+ t := time.NewTimer(dur)
739
+ defer t.Stop()
740
+
741
+ select {
742
+ case <-t.C:
743
+ break
744
+ case <-ctx.Done():
745
+ return ctx.Err()
746
+ }
747
+
748
+ return nil
749
+ }*/
750
+
680
751
func (cloud * CloudProvider ) waitForZonalOp (ctx context.Context , opName string , zone string ) error {
681
752
// The v1 API can query for v1, alpha, or beta operations.
682
753
svc := cloud .service
683
754
project := cloud .project
684
- return wait .Poll (3 * time .Second , 5 * time .Minute , func () (bool , error ) {
755
+ timeout := OpTimeout
756
+ if deadline , ok := ctx .Deadline (); ok {
757
+ timeout = time .Until (deadline )
758
+ klog .Infof ("Getting timeout from passed context opname %s %v" , opName , timeout )
759
+ }
760
+ //callCtx, cancel := ContextWithCallTimeout()
761
+ //defer cancel()
762
+
763
+ return wait .Poll (OpPollInterval , timeout , func () (bool , error ) {
685
764
pollOp , err := svc .ZoneOperations .Get (project , zone , opName ).Context (ctx ).Do ()
686
765
if err != nil {
687
- klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation" , opName , zone )
766
+ klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation: %v " , opName , zone , err )
688
767
return false , err
689
768
}
690
769
done , err := opIsDone (pollOp )
770
+ klog .V (4 ).Infof ("checking op %s is done %t" , opName , done )
691
771
return done , err
692
772
})
693
773
}
@@ -743,7 +823,10 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
743
823
}
744
824
745
825
func opIsDone (op * computev1.Operation ) (bool , error ) {
746
- if op == nil || op .Status != operationStatusDone {
826
+ if op == nil {
827
+ return true , fmt .Errorf ("operation is nil" )
828
+ }
829
+ if op .Status != operationStatusDone {
747
830
return false , nil
748
831
}
749
832
if op .Error != nil && len (op .Error .Errors ) > 0 && op .Error .Errors [0 ] != nil {
0 commit comments