Skip to content

Commit aa1e558

Browse files
committed
Update gce operation timeout
Update the GCE operation timeout to 2 minutes instead of 5 minutes. 90% of attach/detach disk operations complete in less than 15 seconds, and volume creation completes in less than 1 minute. Reducing the timeout for volume operations lets the driver recover more quickly from a previous operation in case that operation is dropped.
1 parent 53e8abd commit aa1e558

File tree

6 files changed

+82
-11
lines changed

6 files changed

+82
-11
lines changed

deploy/kubernetes/base/controller/controller.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ spec:
3232
- "--enable-leader-election"
3333
- "--leader-election-type=leases"
3434
- "--leader-election-namespace=$(PDCSI_NAMESPACE)"
35-
- "--timeout=250s"
35+
- "--timeout=180s"
3636
- "--extra-create-metadata"
3737
# - "--run-controller-service=false" # disable the controller service of the CSI driver
3838
# - "--run-node-service=false" # disable the node service of the CSI driver
@@ -52,7 +52,7 @@ spec:
5252
- "--metrics-address=:22012"
5353
- "--leader-election"
5454
- "--leader-election-namespace=$(PDCSI_NAMESPACE)"
55-
- "--timeout=250s"
55+
- "--timeout=60s"
5656
env:
5757
- name: PDCSI_NAMESPACE
5858
valueFrom:

deploy/kubernetes/images/alpha/image.yaml

+4-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ metadata:
44
name: imagetag-gcepd-driver-alpha-win
55
imageTag:
66
name: gke.gcr.io/gcp-compute-persistent-disk-csi-driver
7-
newName: gcr.io/gke-release-staging/gcp-compute-persistent-disk-csi-driver
8-
newTag: "v1.0.1-gke.9"
7+
#newName: gcr.io/gke-release-staging/gcp-compute-persistent-disk-csi-driver
8+
#newTag: "v1.0.1-gke.9"
9+
newName: gcr.io/jinxu-gke-multi-cloud-dev/gce-pd-windows
10+
newTag: v5
911
---
1012

1113
apiVersion: builtin

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ require (
99
github.com/golang/protobuf v1.4.2
1010
github.com/google/uuid v1.1.1
1111
github.com/hashicorp/go-multierror v1.0.0 // indirect
12-
github.com/kubernetes-csi/csi-proxy/client v0.2.1
12+
github.com/kubernetes-csi/csi-proxy/client v0.2.2
1313
github.com/kubernetes-csi/csi-test/v3 v3.0.0
1414
github.com/onsi/ginkgo v1.11.0
1515
github.com/onsi/gomega v1.7.1

go.sum

+3
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,11 @@ github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
476476
github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA=
477477
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
478478
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
479+
github.com/kubernetes-csi/csi-proxy v0.2.2 h1:LqablYFEGw7FYBjwoh5TeXFzlcx8C+YQjKfGy6fFWJs=
479480
github.com/kubernetes-csi/csi-proxy/client v0.2.1 h1:n21d2U9HvgQ6jfJayafRv8kXXtLvnRNEqoD0mQNucKc=
480481
github.com/kubernetes-csi/csi-proxy/client v0.2.1/go.mod h1:6ptQQmti5QHwBxSsh8Cy00oGdogj0JXewFnu8FFjgOs=
482+
github.com/kubernetes-csi/csi-proxy/client v0.2.2 h1:VpMddHnbYA1oBeU5nrisdyrpOAAT0HqME7fsTi6BG2w=
483+
github.com/kubernetes-csi/csi-proxy/client v0.2.2/go.mod h1:6ptQQmti5QHwBxSsh8Cy00oGdogj0JXewFnu8FFjgOs=
481484
github.com/kubernetes-csi/csi-test/v3 v3.0.0 h1:mVsfA4J67uNm8fdF/Pr84oMqL92qjIhjWbEUH8zv1fU=
482485
github.com/kubernetes-csi/csi-test/v3 v3.0.0/go.mod h1:VdIKGnDZHOjg4M5yd0OZICtsoEzdn64d0K33N6dm35Q=
483486
github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k=

pkg/gce-cloud-provider/compute/gce-compute.go

+18-5
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ const (
4646
GCEAPIVersionV1 GCEAPIVersion = "v1"
4747
// Beta key type
4848
GCEAPIVersionBeta GCEAPIVersion = "beta"
49+
// Timeout for checking on an operation
50+
OpTimeout = 2 * time.Minute
51+
// Time interval for checking on an operation
52+
OpInterval = 3 * time.Second
4953
)
5054

5155
type GCECompute interface {
@@ -611,22 +615,24 @@ func (cloud *CloudProvider) AttachDisk(ctx context.Context, volKey *meta.Key, re
611615
if err != nil {
612616
return fmt.Errorf("failed cloud service attach disk call: %v", err)
613617
}
618+
klog.V(5).Infof("Attaching disk %s operation id is %s", volKey, op.Name)
614619
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
615620
if err != nil {
616-
return fmt.Errorf("failed when waiting for zonal op: %v", err)
621+
return fmt.Errorf("failed when waiting for zonal op %s on attaching volume %v: %v", op.Name, volKey, err)
617622
}
618623
return nil
619624
}
620625

621626
func (cloud *CloudProvider) DetachDisk(ctx context.Context, deviceName, instanceZone, instanceName string) error {
622-
klog.V(5).Infof("Detaching disk %v from %v", deviceName, instanceName)
627+
klog.V(5).Infof("Detaching disk %s from %v", deviceName, instanceName)
623628
op, err := cloud.service.Instances.DetachDisk(cloud.project, instanceZone, instanceName, deviceName).Context(ctx).Do()
624629
if err != nil {
625630
return err
626631
}
632+
klog.V(5).Infof("Detaching disk %s operation id is %s", deviceName, op.Name)
627633
err = cloud.waitForZonalOp(ctx, op.Name, instanceZone)
628634
if err != nil {
629-
return err
635+
return fmt.Errorf("failed when waiting for zonal op %s on detaching volume %s: %v", op.Name, deviceName, err)
630636
}
631637
return nil
632638
}
@@ -681,7 +687,11 @@ func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, opName string, z
681687
// The v1 API can query for v1, alpha, or beta operations.
682688
svc := cloud.service
683689
project := cloud.project
684-
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
690+
timeout := OpTimeout
691+
if deadline, ok := ctx.Deadline(); ok {
692+
timeout = time.Until(deadline)
693+
}
694+
return wait.Poll(OpInterval, timeout, func() (bool, error) {
685695
pollOp, err := svc.ZoneOperations.Get(project, zone, opName).Context(ctx).Do()
686696
if err != nil {
687697
klog.Errorf("WaitForOp(op: %s, zone: %#v) failed to poll the operation", opName, zone)
@@ -743,7 +753,10 @@ func (cloud *CloudProvider) WaitForAttach(ctx context.Context, volKey *meta.Key,
743753
}
744754

745755
func opIsDone(op *computev1.Operation) (bool, error) {
746-
if op == nil || op.Status != operationStatusDone {
756+
if op == nil {
757+
return true, fmt.Errorf("operation is nil")
758+
}
759+
if op.Status != operationStatusDone {
747760
return false, nil
748761
}
749762
if op.Error != nil && len(op.Error.Errors) > 0 && op.Error.Errors[0] != nil {

pkg/gce-pd-csi-driver/node.go

+54-1
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,63 @@ func (ns *GCENodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePub
9999
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("VolumeCapability is invalid: %v", err))
100100
}
101101

102-
notMnt, err := ns.Mounter.Interface.IsLikelyNotMountPoint(targetPath)
102+
/*notMnt, err := ns.Mounter.Interface.IsLikelyNotMountPoint(targetPath)
103103
if err != nil && !os.IsNotExist(err) {
104104
return nil, status.Error(codes.Internal, fmt.Sprintf("cannot validate mount point: %s %v", targetPath, err))
105+
}*/
106+
107+
_, lserr := os.Lstat(targetPath)
108+
klog.Infof("lstat %s: %v", targetPath, lserr)
109+
if lserr != nil {
110+
if os.IsNotExist(lserr) {
111+
klog.Infof("lstat not exist error")
112+
}
113+
}
114+
115+
_, lserr = os.Stat(targetPath)
116+
klog.Infof("lstat %s: %v", targetPath, lserr)
117+
if lserr != nil {
118+
if os.IsNotExist(lserr) {
119+
klog.Infof("Stat not exist error")
120+
}
105121
}
122+
merr := os.MkdirAll(targetPath, 0750)
123+
klog.Infof("mkdir %s, %v", targetPath, merr)
124+
125+
_, lserr = os.Lstat(targetPath)
126+
klog.Infof("lstat %s: %v", targetPath, lserr)
127+
if lserr != nil {
128+
if os.IsNotExist(lserr) {
129+
klog.Infof("lstat not exist error")
130+
}
131+
}
132+
133+
_, lserr = os.Stat(targetPath)
134+
klog.Infof("lstat %s: %v", targetPath, lserr)
135+
if lserr != nil {
136+
if os.IsNotExist(lserr) {
137+
klog.Infof("Stat not exist error")
138+
}
139+
}
140+
141+
rerr := os.RemoveAll(targetPath)
142+
klog.Infof("remove it %v", rerr)
143+
144+
_, lserr = os.Lstat(targetPath)
145+
klog.Infof("lstat %s: %v", targetPath, lserr)
146+
if lserr != nil {
147+
if os.IsNotExist(lserr) {
148+
klog.Infof("lstat not exist error")
149+
}
150+
}
151+
152+
notMnt, err := ns.Mounter.Interface.IsLikelyNotMountPoint(targetPath)
153+
klog.Infof("ismountpoint %s: %v", targetPath, err)
154+
if err != nil && !os.IsNotExist(err) {
155+
klog.Errorf("ismountpoint not exist err")
156+
//return nil, status.Error(codes.Internal, fmt.Sprintf("cannot validate mount point: %s %v", targetPath, err))
157+
}
158+
106159
if !notMnt {
107160
// TODO(#95): check if mount is compatible. Return OK if it is, or appropriate error.
108161
/*

0 commit comments

Comments
 (0)