@@ -30,6 +30,8 @@ import (
30
30
"google.golang.org/grpc/status"
31
31
"k8s.io/apimachinery/pkg/util/sets"
32
32
"k8s.io/apimachinery/pkg/util/uuid"
33
+ "k8s.io/apimachinery/pkg/util/wait"
34
+ "k8s.io/client-go/util/workqueue"
33
35
"k8s.io/klog"
34
36
35
37
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
@@ -47,6 +49,22 @@ type GCEControllerServer struct {
47
49
// operations for that same volume (as defined by Volume Key) return an
48
50
// Aborted error
49
51
volumeLocks * common.VolumeLocks
52
+
53
+ // queue is a rate limited work queue for Controller Publish/Unpublish
54
+ // Volume calls
55
+ queue workqueue.RateLimitingInterface
56
+
57
+ // publishErrorsSeenOnNode is a list of nodes with attach/detach
58
+ // operation failures so those nodes shall be rate limited for all
59
+ // the attach/detach operations until there is an attach / detach
60
+ // operation succeeds
61
+ publishErrorsSeenOnNode map [string ]bool
62
+ }
63
+
64
+ type workItem struct {
65
+ ctx context.Context
66
+ publishReq * csi.ControllerPublishVolumeRequest
67
+ unpublishReq * csi.ControllerUnpublishVolumeRequest
50
68
}
51
69
52
70
var _ csi.ControllerServer = & GCEControllerServer {}
@@ -280,25 +298,113 @@ func (gceCS *GCEControllerServer) DeleteVolume(ctx context.Context, req *csi.Del
280
298
return & csi.DeleteVolumeResponse {}, nil
281
299
}
282
300
301
+ // Run starts the GCEControllerServer.
302
+ func (gceCS * GCEControllerServer ) Run () {
303
+ go wait .Until (gceCS .worker , 1 * time .Second , wait .NeverStop )
304
+ }
305
+
306
+ func (gceCS * GCEControllerServer ) worker () {
307
+ // Runs until workqueue is shut down
308
+ for gceCS .processNextWorkItem () {
309
+ }
310
+ }
311
+
312
+ func (gceCS * GCEControllerServer ) processNextWorkItem () bool {
313
+ item , quit := gceCS .queue .Get ()
314
+ if quit {
315
+ return false
316
+ }
317
+ defer gceCS .queue .Done (item )
318
+
319
+ workItem , ok := item .(* workItem )
320
+ if ! ok {
321
+ gceCS .queue .AddRateLimited (item )
322
+ return true
323
+ }
324
+
325
+ if workItem .publishReq != nil {
326
+ _ , err := gceCS .executeControllerPublishVolume (workItem .ctx , workItem .publishReq )
327
+
328
+ if err != nil {
329
+ klog .Errorf ("ControllerPublishVolume failed with error: %v" , err )
330
+ }
331
+ }
332
+
333
+ if workItem .unpublishReq != nil {
334
+ _ , err := gceCS .executeControllerUnpublishVolume (workItem .ctx , workItem .unpublishReq )
335
+
336
+ if err != nil {
337
+ klog .Errorf ("ControllerUnpublishVolume failed with error: %v" , err )
338
+ }
339
+ }
340
+
341
+ gceCS .queue .Forget (item )
342
+ return true
343
+ }
344
+
283
345
func (gceCS * GCEControllerServer ) ControllerPublishVolume (ctx context.Context , req * csi.ControllerPublishVolumeRequest ) (* csi.ControllerPublishVolumeResponse , error ) {
346
+ // Only valid requests will be queued
347
+ _ , _ , err := gceCS .validateControllerPublishVolumeRequest (ctx , req )
348
+
349
+ if err != nil {
350
+ return nil , err
351
+ }
352
+
353
+ // If the node is not marked, proceed the request
354
+ if _ , found := gceCS .publishErrorsSeenOnNode [req .NodeId ]; ! found {
355
+ return gceCS .executeControllerPublishVolume (ctx , req )
356
+ }
357
+
358
+ // Node is marked so queue up the request
359
+ gceCS .queue .AddRateLimited (& workItem {
360
+ ctx : ctx ,
361
+ publishReq : req ,
362
+ })
363
+ return & csi.ControllerPublishVolumeResponse {}, nil
364
+ }
365
+
366
+ func (gceCS * GCEControllerServer ) validateControllerPublishVolumeRequest (ctx context.Context , req * csi.ControllerPublishVolumeRequest ) (string , * meta.Key , error ) {
284
367
// Validate arguments
285
368
volumeID := req .GetVolumeId ()
286
- readOnly := req .GetReadonly ()
287
369
nodeID := req .GetNodeId ()
288
370
volumeCapability := req .GetVolumeCapability ()
289
371
if len (volumeID ) == 0 {
290
- return nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Volume ID must be provided" )
372
+ return "" , nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Volume ID must be provided" )
291
373
}
292
374
if len (nodeID ) == 0 {
293
- return nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Node ID must be provided" )
375
+ return "" , nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Node ID must be provided" )
294
376
}
295
377
if volumeCapability == nil {
296
- return nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Volume capability must be provided" )
378
+ return "" , nil , status .Error (codes .InvalidArgument , "ControllerPublishVolume Volume capability must be provided" )
297
379
}
298
380
299
381
project , volKey , err := common .VolumeIDToKey (volumeID )
300
382
if err != nil {
301
- return nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("ControllerPublishVolume volume ID is invalid: %v" , err ))
383
+ return "" , nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("ControllerPublishVolume volume ID is invalid: %v" , err ))
384
+ }
385
+
386
+ // TODO(#253): Check volume capability matches for ALREADY_EXISTS
387
+ if err = validateVolumeCapability (volumeCapability ); err != nil {
388
+ return "" , nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("VolumeCapabilities is invalid: %v" , err ))
389
+ }
390
+
391
+ return project , volKey , nil
392
+ }
393
+
394
+ func (gceCS * GCEControllerServer ) executeControllerPublishVolume (ctx context.Context , req * csi.ControllerPublishVolumeRequest ) (* csi.ControllerPublishVolumeResponse , error ) {
395
+ project , volKey , err := gceCS .validateControllerPublishVolumeRequest (ctx , req )
396
+
397
+ if err != nil {
398
+ return nil , err
399
+ }
400
+
401
+ volumeID := req .GetVolumeId ()
402
+ readOnly := req .GetReadonly ()
403
+ nodeID := req .GetNodeId ()
404
+ volumeCapability := req .GetVolumeCapability ()
405
+
406
+ pubVolResp := & csi.ControllerPublishVolumeResponse {
407
+ PublishContext : nil ,
302
408
}
303
409
304
410
project , volKey , err = gceCS .CloudProvider .RepairUnderspecifiedVolumeKey (ctx , project , volKey )
@@ -317,15 +423,6 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
317
423
}
318
424
defer gceCS .volumeLocks .Release (lockingVolumeID )
319
425
320
- // TODO(#253): Check volume capability matches for ALREADY_EXISTS
321
- if err = validateVolumeCapability (volumeCapability ); err != nil {
322
- return nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("VolumeCapabilities is invalid: %v" , err ))
323
- }
324
-
325
- pubVolResp := & csi.ControllerPublishVolumeResponse {
326
- PublishContext : nil ,
327
- }
328
-
329
426
_ , err = gceCS .CloudProvider .GetDisk (ctx , project , volKey , gce .GCEAPIVersionV1 )
330
427
if err != nil {
331
428
if gce .IsGCENotFoundError (err ) {
@@ -375,29 +472,69 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
375
472
376
473
err = gceCS .CloudProvider .WaitForAttach (ctx , project , volKey , instanceZone , instanceName )
377
474
if err != nil {
475
+ // Mark the node and rate limit all the following attach/detach
476
+ // operations for this node
477
+ gceCS .publishErrorsSeenOnNode [nodeID ] = true
378
478
return nil , status .Error (codes .Internal , fmt .Sprintf ("unknown WaitForAttach error: %v" , err ))
379
479
}
380
480
481
+ // Attach succeeds so unmark the node
482
+ delete (gceCS .publishErrorsSeenOnNode , nodeID )
483
+
381
484
klog .V (4 ).Infof ("ControllerPublishVolume succeeded for disk %v to instance %v" , volKey , nodeID )
382
485
return pubVolResp , nil
383
486
}
384
487
385
488
func (gceCS * GCEControllerServer ) ControllerUnpublishVolume (ctx context.Context , req * csi.ControllerUnpublishVolumeRequest ) (* csi.ControllerUnpublishVolumeResponse , error ) {
489
+ // Only valid requests will be queued
490
+ _ , _ , err := gceCS .validateControllerUnpublishVolumeRequest (ctx , req )
491
+
492
+ if err != nil {
493
+ return nil , err
494
+ }
495
+
496
+ // If the node is not marked, proceed the request
497
+ if _ , found := gceCS .publishErrorsSeenOnNode [req .NodeId ]; ! found {
498
+ return gceCS .executeControllerUnpublishVolume (ctx , req )
499
+ }
500
+
501
+ // Node is marked so queue up the request
502
+ gceCS .queue .AddRateLimited (& workItem {
503
+ ctx : ctx ,
504
+ unpublishReq : req ,
505
+ })
506
+
507
+ return & csi.ControllerUnpublishVolumeResponse {}, nil
508
+ }
509
+
510
+ func (gceCS * GCEControllerServer ) validateControllerUnpublishVolumeRequest (ctx context.Context , req * csi.ControllerUnpublishVolumeRequest ) (string , * meta.Key , error ) {
386
511
// Validate arguments
387
512
volumeID := req .GetVolumeId ()
388
513
nodeID := req .GetNodeId ()
389
514
if len (volumeID ) == 0 {
390
- return nil , status .Error (codes .InvalidArgument , "ControllerUnpublishVolume Volume ID must be provided" )
515
+ return "" , nil , status .Error (codes .InvalidArgument , "ControllerUnpublishVolume Volume ID must be provided" )
391
516
}
392
517
if len (nodeID ) == 0 {
393
- return nil , status .Error (codes .InvalidArgument , "ControllerUnpublishVolume Node ID must be provided" )
518
+ return "" , nil , status .Error (codes .InvalidArgument , "ControllerUnpublishVolume Node ID must be provided" )
394
519
}
395
520
396
521
project , volKey , err := common .VolumeIDToKey (volumeID )
397
522
if err != nil {
398
- return nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("ControllerUnpublishVolume Volume ID is invalid: %v" , err ))
523
+ return "" , nil , status .Error (codes .InvalidArgument , fmt .Sprintf ("ControllerUnpublishVolume Volume ID is invalid: %v" , err ))
399
524
}
400
525
526
+ return project , volKey , nil
527
+ }
528
+
529
+ func (gceCS * GCEControllerServer ) executeControllerUnpublishVolume (ctx context.Context , req * csi.ControllerUnpublishVolumeRequest ) (* csi.ControllerUnpublishVolumeResponse , error ) {
530
+ project , volKey , err := gceCS .validateControllerUnpublishVolumeRequest (ctx , req )
531
+
532
+ if err != nil {
533
+ return nil , err
534
+ }
535
+
536
+ volumeID := req .GetVolumeId ()
537
+ nodeID := req .GetNodeId ()
401
538
project , volKey , err = gceCS .CloudProvider .RepairUnderspecifiedVolumeKey (ctx , project , volKey )
402
539
if err != nil {
403
540
if gce .IsGCENotFoundError (err ) {
@@ -443,9 +580,15 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
443
580
444
581
err = gceCS .CloudProvider .DetachDisk (ctx , project , deviceName , instanceZone , instanceName )
445
582
if err != nil {
583
+ // Mark the node and rate limit all the following attach/detach
584
+ // operations for this node
585
+ gceCS .publishErrorsSeenOnNode [nodeID ] = true
446
586
return nil , status .Error (codes .Internal , fmt .Sprintf ("unknown detach error: %v" , err ))
447
587
}
448
588
589
+ // Detach succeeds so unmark the node
590
+ delete (gceCS .publishErrorsSeenOnNode , nodeID )
591
+
449
592
klog .V (4 ).Infof ("ControllerUnpublishVolume succeeded for disk %v from node %v" , volKey , nodeID )
450
593
return & csi.ControllerUnpublishVolumeResponse {}, nil
451
594
}
0 commit comments