Skip to content

allow to specify wait time for attach disk operation #956

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions cmd/gce-pd-csi-driver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,24 @@ import (
)

var (
cloudConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
runControllerService = flag.Bool("run-controller-service", true, "If set to false then the CSI driver does not activate its controller service (default: true)")
runNodeService = flag.Bool("run-node-service", true, "If set to false then the CSI driver does not activate its node service (default: true)")
httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.")
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
extraVolumeLabelsStr = flag.String("extra-labels", "", "Extra labels to attach to each PD created. It is a comma separated list of key value pairs like '<key1>=<value1>,<key2>=<value2>'. See https://cloud.google.com/compute/docs/labeling-resources for details")
version string
cloudConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
runControllerService = flag.Bool("run-controller-service", true, "If set to false then the CSI driver does not activate its controller service (default: true)")
runNodeService = flag.Bool("run-node-service", true, "If set to false then the CSI driver does not activate its node service (default: true)")
httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.")
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
extraVolumeLabelsStr = flag.String("extra-labels", "", "Extra labels to attach to each PD created. It is a comma separated list of key value pairs like '<key1>=<value1>,<key2>=<value2>'. See https://cloud.google.com/compute/docs/labeling-resources for details")
attachDiskBackoffDuration = flag.Duration("attach-disk-backoff-duration", 5*time.Second, "Duration for attachDisk backoff")
attachDiskBackoffFactor = flag.Float64("attach-disk-backoff-factor", 0.0, "Factor for attachDisk backoff")
attachDiskBackoffJitter = flag.Float64("attach-disk-backoff-jitter", 0.0, "Jitter for attachDisk backoff")
attachDiskBackoffSteps = flag.Int("attach-disk-backoff-steps", 24, "Steps for attachDisk backoff")
attachDiskBackoffCap = flag.Duration("attach-disk-backoff-cap", 0, "Cap for attachDisk backoff")
waitForOpBackoffDuration = flag.Duration("wait-op-backoff-duration", 3*time.Second, "Duration for wait for operation backoff")
waitForOpBackoffFactor = flag.Float64("wait-op-backoff-factor", 0.0, "Factor for wait for operation backoff")
waitForOpBackoffJitter = flag.Float64("wait-op-backoff-jitter", 0.0, "Jitter for wait for operation backoff")
waitForOpBackoffSteps = flag.Int("wait-op-backoff-steps", 100, "Steps for wait for operation backoff")
waitForOpBackoffCap = flag.Duration("wait-op-backoff-cap", 0, "Cap for wait for operation backoff")
version string
)

const (
Expand Down Expand Up @@ -128,5 +138,17 @@ func handle() {
klog.Fatalf("Failed to initialize GCE CSI Driver: %v", err)
}

gce.AttachDiskBackoff.Duration = *attachDiskBackoffDuration
gce.AttachDiskBackoff.Factor = *attachDiskBackoffFactor
gce.AttachDiskBackoff.Jitter = *attachDiskBackoffJitter
gce.AttachDiskBackoff.Steps = *attachDiskBackoffSteps
gce.AttachDiskBackoff.Cap = *attachDiskBackoffCap

gce.WaitForOpBackoff.Duration = *waitForOpBackoffDuration
gce.WaitForOpBackoff.Factor = *waitForOpBackoffFactor
gce.WaitForOpBackoff.Jitter = *waitForOpBackoffJitter
gce.WaitForOpBackoff.Steps = *waitForOpBackoffSteps
gce.WaitForOpBackoff.Cap = *waitForOpBackoffCap

gceDriver.Run(*endpoint)
}
26 changes: 22 additions & 4 deletions pkg/gce-cloud-provider/compute/gce-compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,24 @@ const (
GCEAPIVersionBeta GCEAPIVersion = "beta"
)

// AttachDiskBackoff is the retry schedule that WaitForAttach passes to
// wait.ExponentialBackoff while polling for a disk attachment to complete.
//
// The defaults use a 5 second interval for at most 24 steps (24 * 5s is about
// 2 minutes), which approximates the previous "poll every 5 seconds with a
// 2 minute timeout" behavior. Factor, Jitter and Cap are left at zero;
// presumably this keeps the interval constant between steps (confirm against
// the wait.Backoff documentation).
//
// It is a package-level variable so that callers can overwrite its fields
// before the driver starts serving.
var AttachDiskBackoff = wait.Backoff{
	Steps:    24,
	Duration: 5 * time.Second,
	Factor:   0,
	Jitter:   0,
	Cap:      0,
}

// WaitForOpBackoff is the retry schedule that the global, regional and zonal
// operation waiters pass to wait.ExponentialBackoff while polling an
// operation.
//
// The defaults use a 3 second interval for at most 100 steps (100 * 3s is
// about 5 minutes), which approximates the previous "poll every 3 seconds
// with a 5 minute timeout" behavior. Factor, Jitter and Cap are left at zero;
// presumably this keeps the interval constant between steps (confirm against
// the wait.Backoff documentation).
//
// It is a package-level variable so that callers can overwrite its fields
// before the driver starts serving.
var WaitForOpBackoff = wait.Backoff{
	Steps:    100,
	Duration: 3 * time.Second,
	Factor:   0,
	Jitter:   0,
	Cap:      0,
}

type GCECompute interface {
// Metadata information
GetDefaultProject() string
Expand Down Expand Up @@ -739,7 +757,7 @@ func (cloud *CloudProvider) getRegionalDiskTypeURI(project string, region, diskT

func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, project, opName string, zone string) error {
// The v1 API can query for v1, alpha, or beta operations.
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) {
pollOp, err := cloud.service.ZoneOperations.Get(project, zone, opName).Context(ctx).Do()
if err != nil {
klog.Errorf("WaitForOp(op: %s, zone: %#v) failed to poll the operation", opName, zone)
Expand All @@ -752,7 +770,7 @@ func (cloud *CloudProvider) waitForZonalOp(ctx context.Context, project, opName

func (cloud *CloudProvider) waitForRegionalOp(ctx context.Context, project, opName string, region string) error {
// The v1 API can query for v1, alpha, or beta operations.
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) {
pollOp, err := cloud.service.RegionOperations.Get(project, region, opName).Context(ctx).Do()
if err != nil {
klog.Errorf("WaitForOp(op: %s, region: %#v) failed to poll the operation", opName, region)
Expand All @@ -764,7 +782,7 @@ func (cloud *CloudProvider) waitForRegionalOp(ctx context.Context, project, opNa
}

func (cloud *CloudProvider) waitForGlobalOp(ctx context.Context, project, opName string) error {
return wait.Poll(3*time.Second, 5*time.Minute, func() (bool, error) {
return wait.ExponentialBackoff(WaitForOpBackoff, func() (bool, error) {
pollOp, err := cloud.service.GlobalOperations.Get(project, opName).Context(ctx).Do()
if err != nil {
klog.Errorf("waitForGlobalOp(op: %s) failed to poll the operation", opName)
Expand All @@ -778,7 +796,7 @@ func (cloud *CloudProvider) waitForGlobalOp(ctx context.Context, project, opName
func (cloud *CloudProvider) WaitForAttach(ctx context.Context, project string, volKey *meta.Key, instanceZone, instanceName string) error {
klog.V(5).Infof("Waiting for attach of disk %v to instance %v to complete...", volKey.Name, instanceName)
start := time.Now()
return wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
return wait.ExponentialBackoff(AttachDiskBackoff, func() (bool, error) {
klog.V(6).Infof("Polling for attach of disk %v to instance %v to complete for %v", volKey.Name, instanceName, time.Since(start))
disk, err := cloud.GetDisk(ctx, project, volKey, GCEAPIVersionV1)
if err != nil {
Expand Down