Skip to content

Commit 9521b44

Browse files
committed
Use original error code when backing off
Change-Id: If514a26bf513fc47de6e74615f352a83268f5ecf
1 parent d14b457 commit 9521b44

File tree

2 files changed

+63
-22
lines changed

2 files changed

+63
-22
lines changed

pkg/gce-pd-csi-driver/controller.go

+29-10
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,12 @@ type GCEControllerServer struct {
9292
errorBackoff *csiErrorBackoff
9393
}
9494

95+
type csiErrorBackoffId string
96+
9597
type csiErrorBackoff struct {
96-
backoff *flowcontrol.Backoff
98+
backoff *flowcontrol.Backoff
99+
errorCodes map[csiErrorBackoffId]codes.Code
97100
}
98-
type csiErrorBackoffId string
99101

100102
type workItem struct {
101103
ctx context.Context
@@ -496,13 +498,13 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
496498

497499
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
498500
if gceCS.errorBackoff.blocking(backoffId) {
499-
return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
501+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
500502
}
501503

502504
resp, err, diskTypeForMetric := gceCS.executeControllerPublishVolume(ctx, req)
503505
if err != nil {
504-
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err.Error())
505-
gceCS.errorBackoff.next(backoffId)
506+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
507+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
506508
} else {
507509
klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId)
508510
gceCS.errorBackoff.reset(backoffId)
@@ -661,12 +663,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
661663
// Only valid requests will be queued
662664
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
663665
if gceCS.errorBackoff.blocking(backoffId) {
664-
return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
666+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
665667
}
666668
resp, err, diskTypeForMetric := gceCS.executeControllerUnpublishVolume(ctx, req)
667669
if err != nil {
668-
klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId)
669-
gceCS.errorBackoff.next(backoffId)
670+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
671+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
670672
} else {
671673
klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId)
672674
gceCS.errorBackoff.reset(backoffId)
@@ -1816,7 +1818,7 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) {
18161818
}
18171819

18181820
func newCsiErrorBackoff(initialDuration, errorBackoffMaxDuration time.Duration) *csiErrorBackoff {
1819-
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration)}
1821+
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration), make(map[csiErrorBackoffId]codes.Code)}
18201822
}
18211823

18221824
func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId {
@@ -1828,10 +1830,27 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool {
18281830
return blk
18291831
}
18301832

1831-
func (b *csiErrorBackoff) next(id csiErrorBackoffId) {
1833+
func (b *csiErrorBackoff) code(id csiErrorBackoffId) codes.Code {
1834+
if code, ok := b.errorCodes[id]; ok {
1835+
return code
1836+
}
1837+
// If we haven't recorded a code, return unavailable, which signals a problem with the driver
1838+
// (ie, it's not filtered out as a user problem).
1839+
klog.Errorf("using default code for %s", id)
1840+
return codes.Unavailable
1841+
}
1842+
1843+
func (b *csiErrorBackoff) next(id csiErrorBackoffId, code *codes.Code) {
18321844
b.backoff.Next(string(id), b.backoff.Clock.Now())
1845+
if code != nil {
1846+
b.errorCodes[id] = *code
1847+
} else {
1848+
// Use unavailable, which is our default for a backoff error.
1849+
b.errorCodes[id] = codes.Unavailable
1850+
}
18331851
}
18341852

18351853
func (b *csiErrorBackoff) reset(id csiErrorBackoffId) {
18361854
b.backoff.Reset(string(id))
1855+
delete(b.errorCodes, id)
18371856
}

pkg/gce-pd-csi-driver/controller_test.go

+34-12
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ func TestDeleteSnapshot(t *testing.T) {
257257
_, err := gceDriver.cs.DeleteSnapshot(context.Background(), tc.req)
258258
if err != nil {
259259
serverError, ok := status.FromError(err)
260-
t.Logf("get server error %v", serverError)
261260
if !ok {
262261
t.Fatalf("Could not get error status code from err: %v", serverError)
263262
}
@@ -2970,7 +2969,7 @@ type backoffDriverConfig struct {
29702969

29712970
func newFakeCSIErrorBackoff(tc *clock.FakeClock) *csiErrorBackoff {
29722971
backoff := flowcontrol.NewFakeBackOff(errorBackoffInitialDuration, errorBackoffMaxDuration, tc)
2973-
return &csiErrorBackoff{backoff}
2972+
return &csiErrorBackoff{backoff, make(map[csiErrorBackoffId]codes.Code)}
29742973
}
29752974

29762975
func TestControllerUnpublishBackoff(t *testing.T) {
@@ -3010,7 +3009,8 @@ func TestControllerUnpublishBackoff(t *testing.T) {
30103009
}
30113010

30123011
// Mock an active backoff condition on the node.
3013-
driver.cs.errorBackoff.next(backoffId)
3012+
errorCode := codes.Unavailable
3013+
driver.cs.errorBackoff.next(backoffId, &errorCode)
30143014

30153015
tc.config.clock.Step(step)
30163016
// A requst for a a different volume should succeed. This volume is not
@@ -3028,7 +3028,9 @@ func TestControllerUnpublishBackoff(t *testing.T) {
30283028
VolumeId: testVolumeID,
30293029
NodeId: testNodeID,
30303030
}
3031-
// For the first 199 ms, the backoff condition is true. All controller publish request will be denied with 'Unavailable' error code.
3031+
// For the first 199 ms, the backoff condition is true. All controller publish
3032+
// request will be denied with the same unavailable error code as was set on
3033+
// the original error.
30323034
for i := 0; i < 199; i++ {
30333035
var err error
30343036
_, err = driver.cs.ControllerUnpublishVolume(context.Background(), unpubreq)
@@ -3057,13 +3059,16 @@ func TestControllerUnpublishBackoff(t *testing.T) {
30573059
t.Errorf("expected error")
30583060
}
30593061

3060-
// The above failure should cause driver to call Backoff.Next() again and a backoff duration of 400 ms duration is set starting at the 200th millisecond.
3061-
// For the 200-599 ms, the backoff condition is true, and new controller publish requests will be deined.
3062+
// The above failure should cause driver to call backoff.next() again and a
3063+
// backoff duration of 400 ms duration is set starting at the 200th
3064+
// millisecond. For the 200-599 ms, the backoff condition is true, with an
3065+
// internal error this time, and new controller publish requests will be
3066+
// deined.
30623067
for i := 0; i < 399; i++ {
30633068
tc.config.clock.Step(step)
30643069
var err error
30653070
_, err = driver.cs.ControllerUnpublishVolume(context.Background(), unpubreq)
3066-
if !isUnavailableError(err) {
3071+
if !isInternalError(err) {
30673072
t.Errorf("unexpected error %v", err)
30683073
}
30693074
}
@@ -3139,8 +3144,9 @@ func TestControllerPublishBackoff(t *testing.T) {
31393144
backoffId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID)
31403145
step := 1 * time.Millisecond
31413146

3142-
// Mock an active backoff condition on the node.
3143-
driver.cs.errorBackoff.next(backoffId)
3147+
// Mock an active bakcoff condition on the node.
3148+
errorCode := codes.Unavailable
3149+
driver.cs.errorBackoff.next(backoffId, &errorCode)
31443150

31453151
// A detach request for a different disk should succeed. As this disk is not
31463152
// on the instance, the detach will succeed without calling the gce detach
@@ -3213,13 +3219,16 @@ func TestControllerPublishBackoff(t *testing.T) {
32133219
t.Errorf("expected error")
32143220
}
32153221

3216-
// The above failure should cause driver to call Backoff.Next() again and a backoff duration of 400 ms duration is set starting at the 200th millisecond.
3217-
// For the 200-599 ms, the backoff condition is true, and new controller publish requests will be deined.
3222+
// The above failure should cause driver to call backoff.next() again and a
3223+
// backoff duration of 400 ms duration is set starting at the 200th
3224+
// millisecond. For the 200-599 ms, the backoff condition is true, with an
3225+
// internal error this time, and new controller publish requests will be
3226+
// deined.
32183227
for i := 0; i < 399; i++ {
32193228
tc.config.clock.Step(step)
32203229
var err error
32213230
_, err = driver.cs.ControllerPublishVolume(context.Background(), pubreq)
3222-
if !isUnavailableError(err) {
3231+
if !isInternalError(err) {
32233232
t.Errorf("unexpected error %v", err)
32243233
}
32253234
}
@@ -3363,3 +3372,16 @@ func isUnavailableError(err error) bool {
33633372

33643373
return st.Code().String() == "Unavailable"
33653374
}
3375+
3376+
func isInternalError(err error) bool {
3377+
if err == nil {
3378+
return false
3379+
}
3380+
3381+
st, ok := status.FromError(err)
3382+
if !ok {
3383+
return false
3384+
}
3385+
3386+
return st.Code().String() == "Internal"
3387+
}

0 commit comments

Comments
 (0)