diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go index 18cdce59b..96e75d415 100644 --- a/cmd/gce-pd-csi-driver/main.go +++ b/cmd/gce-pd-csi-driver/main.go @@ -33,14 +33,24 @@ import ( ) var ( - cloudConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config") - endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint") - runControllerService = flag.Bool("run-controller-service", true, "If set to false then the CSI driver does not activate its controller service (default: true)") - runNodeService = flag.Bool("run-node-service", true, "If set to false then the CSI driver does not activate its node service (default: true)") - httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.") - metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") - extraVolumeLabelsStr = flag.String("extra-labels", "", "Extra labels to attach to each PD created. It is a comma separated list of key value pairs like '=,='. See https://cloud.google.com/compute/docs/labeling-resources for details") - version string + cloudConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config") + endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint") + runControllerService = flag.Bool("run-controller-service", true, "If set to false then the CSI driver does not activate its controller service (default: true)") + runNodeService = flag.Bool("run-node-service", true, "If set to false then the CSI driver does not activate its node service (default: true)") + httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). 
The default is empty string, which means metrics endpoint is disabled.") + metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") + extraVolumeLabelsStr = flag.String("extra-labels", "", "Extra labels to attach to each PD created. It is a comma separated list of key value pairs like '=,='. See https://cloud.google.com/compute/docs/labeling-resources for details") + attachDiskBackoffDuration = flag.Duration("attach-disk-backoff-duration", 5*time.Second, "Duration for attachDisk backoff") + attachDiskBackoffFactor = flag.Float64("attach-disk-backoff-factor", 0.0, "Factor for attachDisk backoff") + attachDiskBackoffJitter = flag.Float64("attach-disk-backoff-jitter", 0.0, "Jitter for attachDisk backoff") + attachDiskBackoffSteps = flag.Int("attach-disk-backoff-steps", 24, "Steps for attachDisk backoff") + attachDiskBackoffCap = flag.Duration("attach-disk-backoff-cap", 0, "Cap for attachDisk backoff") + waitForOpBackoffDuration = flag.Duration("wait-op-backoff-duration", 3*time.Second, "Duration for wait for operation backoff") + waitForOpBackoffFactor = flag.Float64("wait-op-backoff-factor", 0.0, "Factor for wait for operation backoff") + waitForOpBackoffJitter = flag.Float64("wait-op-backoff-jitter", 0.0, "Jitter for wait for operation backoff") + waitForOpBackoffSteps = flag.Int("wait-op-backoff-steps", 100, "Steps for wait for operation backoff") + waitForOpBackoffCap = flag.Duration("wait-op-backoff-cap", 0, "Cap for wait for operation backoff") + version string ) const ( @@ -128,5 +138,17 @@ func handle() { klog.Fatalf("Failed to initialize GCE CSI Driver: %v", err) } + gce.AttachDiskBackoff.Duration = *attachDiskBackoffDuration + gce.AttachDiskBackoff.Factor = *attachDiskBackoffFactor + gce.AttachDiskBackoff.Jitter = *attachDiskBackoffJitter + gce.AttachDiskBackoff.Steps = *attachDiskBackoffSteps + gce.AttachDiskBackoff.Cap = *attachDiskBackoffCap + + gce.WaitForOpBackoff.Duration = 
*waitForOpBackoffDuration + gce.WaitForOpBackoff.Factor = *waitForOpBackoffFactor + gce.WaitForOpBackoff.Jitter = *waitForOpBackoffJitter + gce.WaitForOpBackoff.Steps = *waitForOpBackoffSteps + gce.WaitForOpBackoff.Cap = *waitForOpBackoffCap + gceDriver.Run(*endpoint) } diff --git a/pkg/gce-cloud-provider/compute/gce-compute.go b/pkg/gce-cloud-provider/compute/gce-compute.go index cd9235952..1bce9177a 100644 --- a/pkg/gce-cloud-provider/compute/gce-compute.go +++ b/pkg/gce-cloud-provider/compute/gce-compute.go @@ -49,6 +49,24 @@ const ( GCEAPIVersionBeta GCEAPIVersion = "beta" ) +// AttachDiskBackoff is the backoff used while waiting for AttachDisk to complete. +// The defaults (24 steps of 5 seconds) approximate polling every 5 seconds with a 2-minute timeout. +var AttachDiskBackoff = wait.Backoff{ + Duration: 5 * time.Second, + Factor: 0.0, + Jitter: 0.0, + Steps: 24, + Cap: 0} + +// WaitForOpBackoff is the backoff used while waiting for a global, regional, or zonal operation to complete. +// The defaults (100 steps of 3 seconds) approximate polling every 3 seconds with a 5-minute timeout. +var WaitForOpBackoff = wait.Backoff{ + Duration: 3 * time.Second, + Factor: 0.0, + Jitter: 0.0, + Steps: 100, + Cap: 0} + type GCECompute interface { // Metadata information GetDefaultProject() string @@ -739,7 +757,7 @@ func (cloud *CloudProvider) getRegionalDiskTypeURI(project string, region, diskT func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, project, opName string, zone string) error { // The v1 API can query for v1, alpha, or beta operations. 
- return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) { + return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) { pollOp, err := cloud.service.ZoneOperations.Get(project, zone, opName).Context(ctx).Do() if err != nil { klog.Errorf("WaitForOp(op: %s, zone: %#v) failed to poll the operation", opName, zone) @@ -752,7 +770,7 @@ func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, project, opName func (cloud *CloudProvider) waitForRegionalOp(ctx context.Context, project, opName string, region string) error { // The v1 API can query for v1, alpha, or beta operations. - return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) { + return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) { pollOp, err := cloud.service.RegionOperations.Get(project, region, opName).Context(ctx).Do() if err != nil { klog.Errorf("WaitForOp(op: %s, region: %#v) failed to poll the operation", opName, region) @@ -764,7 +782,7 @@ func (cloud *CloudProvider) waitForRegionalOp(ctx context.Context, project, opNa } func (cloud *CloudProvider) waitForGlobalOp(ctx context.Context, project, opName string) error { - return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) { + return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) { pollOp, err := cloud.service.GlobalOperations.Get(project, opName).Context(ctx).Do() if err != nil { klog.Errorf("waitForGlobalOp(op: %s) failed to poll the operation", opName) @@ -778,7 +796,7 @@ func (cloud *CloudProvider) waitForGlobalOp(ctx context.Context, project, opName func (cloud *CloudProvider) WaitForAttach(ctx context.Context, project string, volKey *meta.Key, instanceZone, instanceName string) error { klog.V(5).Infof("Waiting for attach of disk %v to instance %v to complete...", volKey.Name, instanceName) start := time.Now() - return wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) { + return wait.ExponentialBackoff(AttachDiskBackoff, func() (bool, error) { 
klog.V(6).Infof("Polling for attach of disk %v to instance %v to complete for %v", volKey.Name, instanceName, time.Since(start)) disk, err := cloud.GetDisk(ctx, project, volKey, GCEAPIVersionV1) if err != nil {