@@ -46,6 +46,28 @@ const (
46
46
GCEAPIVersionV1 GCEAPIVersion = "v1"
47
47
// Alpha key type
48
48
GCEAPIVersionBeta GCEAPIVersion = "beta"
49
+ // OpPollInterval is the ineterval between polling an operation
50
+ OpPollInterval = 1 * time .Second
51
+ // OpTimeout is the default total timeout for polling on an operation
52
+ OpTimeout = 2 * time .Minute
53
+ // defaultCallTimeout is the ddefault context timeout for creating/getting on an operation
54
+ defaultCallTimeout = 10 * time .Minute
55
+ // Time interval for checking on an operation
56
+ AttachDiskInitialDelay = 6 * time .Second
57
+
58
+ InsertDiskInitialDelay = 3 * time .Second
59
+
60
+ // volumeAttachmentConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
61
+ //volumeAttachmentStatusConsecutiveErrorLimit = 10
62
+
63
+ // Attach typically takes 2-5 seconds (average is 2). Asking before 2 seconds is just waste of API quota.
64
+ volumeAttachmentStatusInitialDelay = 2 * time .Second
65
+ // Detach typically takes 5-10 seconds (average is 6). Asking before 5 seconds is just waste of API quota.
66
+ volumeDetachmentStatusInitialDelay = 5 * time .Second
67
+ // After the initial delay, poll attach/detach with exponential backoff (2046 seconds total)
68
+ volumeAttachmentStatusPollDelay = 2 * time .Second
69
+ volumeAttachmentStatusFactor = 2
70
+ volumeAttachmentStatusSteps = 11
49
71
)
50
72
51
73
type GCECompute interface {
@@ -306,6 +328,11 @@ func ValidateDiskParameters(disk *CloudDisk, params common.DiskParameters) error
306
328
return nil
307
329
}
308
330
331
+ // ContextWithCallTimeout returns a context with a default timeout, used for generated client calls.
332
+ func ContextWithCallTimeout () (context.Context , context.CancelFunc ) {
333
+ return context .WithTimeout (context .Background (), defaultCallTimeout )
334
+ }
335
+
309
336
func (cloud * CloudProvider ) InsertDisk (ctx context.Context , volKey * meta.Key , params common.DiskParameters , capBytes int64 , capacityRange * csi.CapacityRange , replicaZones []string , snapshotID string , multiWriter bool ) error {
310
337
klog .V (5 ).Infof ("Inserting disk %v" , volKey )
311
338
@@ -489,17 +516,20 @@ func (cloud *CloudProvider) insertZonalDisk(
489
516
}
490
517
}
491
518
519
+ callCtx , cancel := ContextWithCallTimeout ()
520
+ defer cancel ()
521
+
492
522
if gceAPIVersion == GCEAPIVersionBeta {
493
523
var insertOp * computebeta.Operation
494
524
betaDiskToCreate := convertV1DiskToBetaDisk (diskToCreate )
495
525
betaDiskToCreate .MultiWriter = multiWriter
496
- insertOp , err = cloud .betaService .Disks .Insert (cloud .project , volKey .Zone , betaDiskToCreate ).Context (ctx ).Do ()
526
+ insertOp , err = cloud .betaService .Disks .Insert (cloud .project , volKey .Zone , betaDiskToCreate ).Context (callCtx ).Do ()
497
527
if insertOp != nil {
498
528
opName = insertOp .Name
499
529
}
500
530
} else {
501
531
var insertOp * computev1.Operation
502
- insertOp , err = cloud .service .Disks .Insert (cloud .project , volKey .Zone , diskToCreate ).Context (ctx ).Do ()
532
+ insertOp , err = cloud .service .Disks .Insert (cloud .project , volKey .Zone , diskToCreate ).Context (callCtx ).Do ()
503
533
if insertOp != nil {
504
534
opName = insertOp .Name
505
535
}
@@ -523,7 +553,7 @@ func (cloud *CloudProvider) insertZonalDisk(
523
553
}
524
554
return fmt .Errorf ("unknown Insert disk error: %v" , err )
525
555
}
526
-
556
+ klog . Infof ( "Insert disk %s op id %s" , volKey . Name , opName )
527
557
err = cloud .waitForZonalOp (ctx , opName , volKey .Zone )
528
558
529
559
if err != nil {
@@ -568,6 +598,7 @@ func (cloud *CloudProvider) deleteZonalDisk(ctx context.Context, zone, name stri
568
598
}
569
599
return err
570
600
}
601
+ klog .Infof ("delete disk %s op id %s" , name , op .Name )
571
602
err = cloud .waitForZonalOp (ctx , op .Name , zone )
572
603
if err != nil {
573
604
return err
@@ -607,26 +638,53 @@ func (cloud *CloudProvider) AttachDisk(ctx context.Context, volKey *meta.Key, re
607
638
Type : diskType ,
608
639
}
609
640
610
- op , err := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (ctx ).Do ()
641
+ callCtx , cancel := ContextWithCallTimeout ()
642
+ defer cancel ()
643
+
644
+ op , err := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (callCtx ).Do ()
611
645
if err != nil {
612
- return fmt .Errorf ("failed cloud service attach disk call: %v" , err )
646
+ return fmt .Errorf ("failed cloud service attach disk %v call: %v" , volKey , err )
613
647
}
614
- err = cloud .waitForZonalOp (ctx , op .Name , instanceZone )
648
+ klog .V (5 ).Infof ("Attaching disk %s operation id is %s" , volKey , op .Name )
649
+ time .Sleep (2 * time .Second )
650
+
651
+ op2 , err2 := cloud .service .Instances .AttachDisk (cloud .project , instanceZone , instanceName , attachedDiskV1 ).Context (callCtx ).Do ()
652
+ if err2 != nil {
653
+ klog .Errorf ("failed cloud service attach disk %v call op2 %v: %v" , volKey , op2 , err2 )
654
+ //return fmt.Errorf("failed cloud service attach disk %v call op2 %v: %v", volKey, op2, err2)
655
+ } else {
656
+ klog .V (2 ).Infof ("Attaching disk %s op2 id is %s" , volKey , op2 .Name )
657
+ }
658
+
659
+ klog .V (5 ).Infof ("Attaching disk %s operation id is %s" , volKey , op .Name )
660
+
661
+ /*err = cloud.waitForZonalOp2(ctx, op.Name, instanceZone)
615
662
if err != nil {
616
- return fmt .Errorf ("failed when waiting for zonal op: %v" , err )
663
+ klog.Errorf("failed when waiting for zonal op %s on attaching volume %v: %v", op.Name, volKey, err)
664
+ //return fmt.Errorf("failed when waiting for zonal op %s on attaching volume %v: %v", op.Name, volKey, err)
665
+ }*/
666
+
667
+ err = cloud .waitForZonalOp2 (ctx , op .Name , op2 .Name , instanceName , instanceZone , volKey )
668
+ if err != nil {
669
+ return fmt .Errorf ("failed when waiting for zonal op %s op2 %s on attaching volume %v: %v" , op .Name , op2 .Name , volKey , err )
617
670
}
618
671
return nil
619
672
}
620
673
621
674
func (cloud * CloudProvider ) DetachDisk (ctx context.Context , deviceName , instanceZone , instanceName string ) error {
622
- klog .V (5 ).Infof ("Detaching disk %v from %v" , deviceName , instanceName )
623
- op , err := cloud .service .Instances .DetachDisk (cloud .project , instanceZone , instanceName , deviceName ).Context (ctx ).Do ()
675
+
676
+ callCtx , cancel := ContextWithCallTimeout ()
677
+ defer cancel ()
678
+
679
+ klog .V (5 ).Infof ("Detaching disk %s from %v" , deviceName , instanceName )
680
+ op , err := cloud .service .Instances .DetachDisk (cloud .project , instanceZone , instanceName , deviceName ).Context (callCtx ).Do ()
624
681
if err != nil {
625
682
return err
626
683
}
684
+ klog .V (5 ).Infof ("Detaching disk %s operation id is %s" , deviceName , op .Name )
627
685
err = cloud .waitForZonalOp (ctx , op .Name , instanceZone )
628
686
if err != nil {
629
- return err
687
+ return fmt . Errorf ( "failed when waiting for zonal op %s on detaching volume %s: %v" , op . Name , deviceName , err )
630
688
}
631
689
return nil
632
690
}
@@ -677,21 +735,89 @@ func (cloud *CloudProvider) getRegionalDiskTypeURI(region, diskType string) stri
677
735
return cloud .service .BasePath + fmt .Sprintf (diskTypeURITemplateRegional , cloud .project , region , diskType )
678
736
}
679
737
738
+ // SleepWithContext will wait for the timer duration to expire, or the context
739
+ // is canceled. Which ever happens first. If the context is canceled the Context's
740
+ // error will be returned.
741
+ //
742
+ // Expects Context to always return a non-nil error if the Done channel is closed.
743
+ /*func SleepWithContext(ctx Context, dur time.Duration) error {
744
+ t := time.NewTimer(dur)
745
+ defer t.Stop()
746
+
747
+ select {
748
+ case <-t.C:
749
+ break
750
+ case <-ctx.Done():
751
+ return ctx.Err()
752
+ }
753
+
754
+ return nil
755
+ }*/
756
+
680
757
func (cloud * CloudProvider ) waitForZonalOp (ctx context.Context , opName string , zone string ) error {
681
758
// The v1 API can query for v1, alpha, or beta operations.
682
759
svc := cloud .service
683
760
project := cloud .project
684
- return wait .Poll (3 * time .Second , 5 * time .Minute , func () (bool , error ) {
761
+ timeout := OpTimeout
762
+ if deadline , ok := ctx .Deadline (); ok {
763
+ timeout = time .Until (deadline )
764
+ klog .Infof ("Getting timeout from passed context opname %s %v" , opName , timeout )
765
+ }
766
+ //callCtx, cancel := ContextWithCallTimeout()
767
+ //defer cancel()
768
+
769
+ return wait .Poll (OpPollInterval , timeout , func () (bool , error ) {
685
770
pollOp , err := svc .ZoneOperations .Get (project , zone , opName ).Context (ctx ).Do ()
686
771
if err != nil {
687
- klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation" , opName , zone )
772
+ klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation: %v " , opName , zone , err )
688
773
return false , err
689
774
}
690
775
done , err := opIsDone (pollOp )
776
+ klog .V (4 ).Infof ("checking op %s is done %t" , opName , done )
691
777
return done , err
692
778
})
693
779
}
694
780
781
+ func (cloud * CloudProvider ) waitForZonalOp2 (ctx context.Context , opName , opName2 string , instance , zone string , volKey * meta.Key ) error {
782
+ // The v1 API can query for v1, alpha, or beta operations.
783
+ svc := cloud .service
784
+ project := cloud .project
785
+ timeout := OpTimeout
786
+ if deadline , ok := ctx .Deadline (); ok {
787
+ timeout = time .Until (deadline )
788
+ klog .Infof ("Getting timeout from passed context opname %s %v" , opName , timeout )
789
+ }
790
+ //callCtx, cancel := ContextWithCallTimeout()
791
+ //defer cancel()
792
+
793
+ return wait .Poll (OpPollInterval , timeout , func () (bool , error ) {
794
+ pollOp , err := svc .ZoneOperations .Get (project , zone , opName ).Context (ctx ).Do ()
795
+ if err != nil {
796
+ klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation: %v" , opName , zone , err )
797
+ return false , err
798
+ }
799
+ done , err := opIsDone (pollOp )
800
+ klog .V (4 ).Infof ("checking op %s is done %t %v" , opName , done , err )
801
+
802
+ pollOp , err = svc .ZoneOperations .Get (project , zone , opName2 ).Context (ctx ).Do ()
803
+ if err != nil {
804
+ klog .Errorf ("WaitForOp(op: %s, zone: %#v) failed to poll the operation2: %v" , opName2 , zone , err )
805
+ return false , err
806
+ }
807
+ done2 , err2 := opIsDone (pollOp )
808
+ klog .V (4 ).Infof ("checking op2 %s is done %t %v" , opName2 , done2 , err2 )
809
+
810
+ if done || done2 {
811
+ _ , err = cloud .WaitForAttach2 (ctx , volKey , zone , instance )
812
+ if err != nil {
813
+ klog .Errorf ("volkey %v unknown WaitForAttach error: %v" , volKey , err )
814
+ //return nil, status.Error(codes.Internal, fmt.Sprintf("unknown WaitForAttach error: %v", err))
815
+ }
816
+ }
817
+ return done && done2 , err
818
+ })
819
+ }
820
+
695
821
func (cloud * CloudProvider ) waitForRegionalOp (ctx context.Context , opName string , region string ) error {
696
822
// The v1 API can query for v1, alpha, or beta operations.
697
823
return wait .Poll (3 * time .Second , 5 * time .Minute , func () (bool , error ) {
@@ -723,7 +849,7 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
723
849
klog .V (5 ).Infof ("Waiting for attach of disk %v to instance %v to complete..." , volKey .Name , instanceName )
724
850
start := time .Now ()
725
851
return wait .Poll (5 * time .Second , 2 * time .Minute , func () (bool , error ) {
726
- klog .V (6 ).Infof ("Polling for attach of disk %v to instance %v to complete for %v" , volKey .Name , instanceName , time .Since (start ))
852
+ klog .V (5 ).Infof ("Polling for attach of disk %v to instance %v to complete for %v" , volKey .Name , instanceName , time .Since (start ))
727
853
disk , err := cloud .GetDisk (ctx , volKey , GCEAPIVersionV1 )
728
854
if err != nil {
729
855
return false , fmt .Errorf ("GetDisk failed to get disk: %v" , err )
@@ -742,8 +868,35 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
742
868
})
743
869
}
744
870
871
+ func (cloud * CloudProvider ) WaitForAttach2 (ctx context.Context , volKey * meta.Key , instanceZone , instanceName string ) (bool , error ) {
872
+ klog .V (5 ).Infof ("Waiting for attach of disk %v to instance %v to complete..." , volKey .Name , instanceName )
873
+ start := time .Now ()
874
+ //return wait.Poll(1*time.Second, 2*time.Second, func() (bool, error) {
875
+ klog .V (5 ).Infof ("Polling for attach of disk %v to instance %v to complete for %v" , volKey .Name , instanceName , time .Since (start ))
876
+ disk , err := cloud .GetDisk (ctx , volKey , GCEAPIVersionV1 )
877
+ if err != nil {
878
+ return false , fmt .Errorf ("GetDisk failed to get disk: %v" , err )
879
+ }
880
+
881
+ if disk == nil {
882
+ return false , fmt .Errorf ("Disk %v could not be found" , volKey .Name )
883
+ }
884
+
885
+ for _ , user := range disk .GetUsers () {
886
+ if strings .Contains (user , instanceName ) && strings .Contains (user , instanceZone ) {
887
+ klog .Infof ("disk %v is attachd " , volKey )
888
+ return true , nil
889
+ }
890
+ }
891
+ return false , nil
892
+ //})
893
+ }
894
+
745
895
func opIsDone (op * computev1.Operation ) (bool , error ) {
746
- if op == nil || op .Status != operationStatusDone {
896
+ if op == nil {
897
+ return true , fmt .Errorf ("operation is nil" )
898
+ }
899
+ if op .Status != operationStatusDone {
747
900
return false , nil
748
901
}
749
902
if op .Error != nil && len (op .Error .Errors ) > 0 && op .Error .Errors [0 ] != nil {
0 commit comments