Skip to content

Commit fc0265b

Browse files
authored
Merge pull request #1312 from k8s-infra-cherrypick-robot/cherry-pick-1298-to-release-1.9
[release-1.9] Use original error code when backing off
2 parents 74d094f + 26ed78b commit fc0265b

File tree

4 files changed

+123
-102
lines changed

4 files changed

+123
-102
lines changed

pkg/common/utils.go

+37-48
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ var (
7979
// Full or partial URL of the machine type resource, in the format:
8080
// zones/zone/machineTypes/machine-type
8181
machineTypeRegex = regexp.MustCompile(machineTypePattern)
82+
83+
// userErrorCodeMap tells how API error types are translated to error codes.
84+
userErrorCodeMap = map[int]codes.Code{
85+
http.StatusForbidden: codes.PermissionDenied,
86+
http.StatusBadRequest: codes.InvalidArgument,
87+
http.StatusTooManyRequests: codes.ResourceExhausted,
88+
http.StatusNotFound: codes.NotFound,
89+
}
8290
)
8391

8492
func BytesToGbRoundDown(bytes int64) int64 {
@@ -296,82 +304,63 @@ func ParseMachineType(machineTypeUrl string) (string, error) {
296304
return machineType[1], nil
297305
}
298306

299-
// CodeForError returns a pointer to the grpc error code that maps to the http
300-
// error code for the passed in user googleapi error or context error. Returns
301-
// codes.Internal if the given error is not a googleapi error caused by the user.
302-
// The following http error codes are considered user errors:
303-
// (1) http 400 Bad Request, returns grpc InvalidArgument,
304-
// (2) http 403 Forbidden, returns grpc PermissionDenied,
305-
// (3) http 404 Not Found, returns grpc NotFound
306-
// (4) http 429 Too Many Requests, returns grpc ResourceExhausted
307-
// The following errors are considered context errors:
308-
// (1) "context deadline exceeded", returns grpc DeadlineExceeded,
309-
// (2) "context canceled", returns grpc Canceled
310-
func CodeForError(err error) *codes.Code {
311-
if err == nil {
312-
return nil
307+
// CodeForError returns the grpc error code that maps to the http error code for the
308+
// passed in user googleapi error or context error. Returns codes.Internal if the given
309+
// error is not a googleapi error caused by the user. userErrorCodeMap is used for
310+
// encoding most errors.
311+
func CodeForError(sourceError error) codes.Code {
312+
if sourceError == nil {
313+
return codes.Internal
313314
}
314315

315-
if errCode := existingErrorCode(err); errCode != nil {
316-
return errCode
316+
if code, err := existingErrorCode(sourceError); err == nil {
317+
return code
317318
}
318-
if code := isContextError(err); code != nil {
319+
if code, err := isContextError(sourceError); err == nil {
319320
return code
320321
}
321322

322-
internalErrorCode := codes.Internal
323-
// Upwrap the error
324323
var apiErr *googleapi.Error
325-
if !errors.As(err, &apiErr) {
326-
return &internalErrorCode
327-
}
328-
329-
userErrors := map[int]codes.Code{
330-
http.StatusForbidden: codes.PermissionDenied,
331-
http.StatusBadRequest: codes.InvalidArgument,
332-
http.StatusTooManyRequests: codes.ResourceExhausted,
333-
http.StatusNotFound: codes.NotFound,
324+
if !errors.As(sourceError, &apiErr) {
325+
return codes.Internal
334326
}
335-
if code, ok := userErrors[apiErr.Code]; ok {
336-
return &code
327+
if code, ok := userErrorCodeMap[apiErr.Code]; ok {
328+
return code
337329
}
338330

339-
return &internalErrorCode
331+
return codes.Internal
340332
}
341333

342-
// isContextError returns a pointer to the grpc error code DeadlineExceeded
343-
// if the passed in error contains the "context deadline exceeded" string and returns
344-
// the grpc error code Canceled if the error contains the "context canceled" string.
345-
func isContextError(err error) *codes.Code {
334+
// isContextError returns the grpc error code DeadlineExceeded if the passed in error
335+
// contains the "context deadline exceeded" string and returns the grpc error code
336+
// Canceled if the error contains the "context canceled" string. It returns and error if
337+
// err isn't a context error.
338+
func isContextError(err error) (codes.Code, error) {
346339
if err == nil {
347-
return nil
340+
return codes.Unknown, fmt.Errorf("null error")
348341
}
349342

350343
errStr := err.Error()
351344
if strings.Contains(errStr, context.DeadlineExceeded.Error()) {
352-
return errCodePtr(codes.DeadlineExceeded)
345+
return codes.DeadlineExceeded, nil
353346
}
354347
if strings.Contains(errStr, context.Canceled.Error()) {
355-
return errCodePtr(codes.Canceled)
348+
return codes.Canceled, nil
356349
}
357-
return nil
350+
return codes.Unknown, fmt.Errorf("Not a context error: %w", err)
358351
}
359352

360-
func existingErrorCode(err error) *codes.Code {
353+
func existingErrorCode(err error) (codes.Code, error) {
361354
if err == nil {
362-
return nil
355+
return codes.Unknown, fmt.Errorf("null error")
363356
}
364357
if status, ok := status.FromError(err); ok {
365-
return errCodePtr(status.Code())
358+
return status.Code(), nil
366359
}
367-
return nil
368-
}
369-
370-
func errCodePtr(code codes.Code) *codes.Code {
371-
return &code
360+
return codes.Unknown, fmt.Errorf("no existing error code for %w", err)
372361
}
373362

374363
func LoggedError(msg string, err error) error {
375364
klog.Errorf(msg+"%v", err.Error())
376-
return status.Errorf(*CodeForError(err), msg+"%v", err.Error())
365+
return status.Errorf(CodeForError(err), msg+"%v", err.Error())
377366
}

pkg/common/utils_test.go

+27-31
Original file line numberDiff line numberDiff line change
@@ -855,57 +855,51 @@ func TestParseMachineType(t *testing.T) {
855855
}
856856

857857
func TestCodeForError(t *testing.T) {
858-
internalErrorCode := codes.Internal
859-
userErrorCode := codes.InvalidArgument
860858
testCases := []struct {
861859
name string
862860
inputErr error
863-
expCode *codes.Code
861+
expCode codes.Code
864862
}{
865863
{
866864
name: "Not googleapi.Error",
867865
inputErr: errors.New("I am not a googleapi.Error"),
868-
expCode: &internalErrorCode,
866+
expCode: codes.Internal,
869867
},
870868
{
871869
name: "User error",
872870
inputErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
873-
expCode: &userErrorCode,
871+
expCode: codes.InvalidArgument,
874872
},
875873
{
876874
name: "googleapi.Error but not a user error",
877875
inputErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
878-
expCode: &internalErrorCode,
876+
expCode: codes.Internal,
879877
},
880878
{
881879
name: "context canceled error",
882880
inputErr: context.Canceled,
883-
expCode: errCodePtr(codes.Canceled),
881+
expCode: codes.Canceled,
884882
},
885883
{
886884
name: "context deadline exceeded error",
887885
inputErr: context.DeadlineExceeded,
888-
expCode: errCodePtr(codes.DeadlineExceeded),
886+
expCode: codes.DeadlineExceeded,
889887
},
890888
{
891889
name: "status error with Aborted error code",
892890
inputErr: status.Error(codes.Aborted, "aborted error"),
893-
expCode: errCodePtr(codes.Aborted),
891+
expCode: codes.Aborted,
894892
},
895893
{
896894
name: "nil error",
897895
inputErr: nil,
898-
expCode: nil,
896+
expCode: codes.Internal,
899897
},
900898
}
901899

902900
for _, tc := range testCases {
903-
t.Logf("Running test: %v", tc.name)
904901
errCode := CodeForError(tc.inputErr)
905-
if (tc.expCode == nil) != (errCode == nil) {
906-
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
907-
}
908-
if tc.expCode != nil && *errCode != *tc.expCode {
902+
if errCode != tc.expCode {
909903
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
910904
}
911905
}
@@ -915,46 +909,48 @@ func TestIsContextError(t *testing.T) {
915909
cases := []struct {
916910
name string
917911
err error
918-
expectedErrCode *codes.Code
912+
expectedErrCode codes.Code
913+
expectError bool
919914
}{
920915
{
921916
name: "deadline exceeded error",
922917
err: context.DeadlineExceeded,
923-
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
918+
expectedErrCode: codes.DeadlineExceeded,
924919
},
925920
{
926921
name: "contains 'context deadline exceeded'",
927922
err: fmt.Errorf("got error: %w", context.DeadlineExceeded),
928-
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
923+
expectedErrCode: codes.DeadlineExceeded,
929924
},
930925
{
931926
name: "context canceled error",
932927
err: context.Canceled,
933-
expectedErrCode: errCodePtr(codes.Canceled),
928+
expectedErrCode: codes.Canceled,
934929
},
935930
{
936931
name: "contains 'context canceled'",
937932
err: fmt.Errorf("got error: %w", context.Canceled),
938-
expectedErrCode: errCodePtr(codes.Canceled),
933+
expectedErrCode: codes.Canceled,
939934
},
940935
{
941-
name: "does not contain 'context canceled' or 'context deadline exceeded'",
942-
err: fmt.Errorf("unknown error"),
943-
expectedErrCode: nil,
936+
name: "does not contain 'context canceled' or 'context deadline exceeded'",
937+
err: fmt.Errorf("unknown error"),
938+
expectError: true,
944939
},
945940
{
946-
name: "nil error",
947-
err: nil,
948-
expectedErrCode: nil,
941+
name: "nil error",
942+
err: nil,
943+
expectError: true,
949944
},
950945
}
951946

952947
for _, test := range cases {
953-
errCode := isContextError(test.err)
954-
if (test.expectedErrCode == nil) != (errCode == nil) {
955-
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
956-
}
957-
if test.expectedErrCode != nil && *errCode != *test.expectedErrCode {
948+
errCode, err := isContextError(test.err)
949+
if test.expectError {
950+
if err == nil {
951+
t.Errorf("test %v failed, expected error, got %v", test.name, errCode)
952+
}
953+
} else if errCode != test.expectedErrCode {
958954
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
959955
}
960956
}

pkg/gce-pd-csi-driver/controller.go

+24-10
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,12 @@ type GCEControllerServer struct {
9292
errorBackoff *csiErrorBackoff
9393
}
9494

95+
type csiErrorBackoffId string
96+
9597
type csiErrorBackoff struct {
96-
backoff *flowcontrol.Backoff
98+
backoff *flowcontrol.Backoff
99+
errorCodes map[csiErrorBackoffId]codes.Code
97100
}
98-
type csiErrorBackoffId string
99101

100102
type workItem struct {
101103
ctx context.Context
@@ -494,13 +496,13 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
494496

495497
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
496498
if gceCS.errorBackoff.blocking(backoffId) {
497-
return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
499+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
498500
}
499501

500502
resp, err, diskTypeForMetric := gceCS.executeControllerPublishVolume(ctx, req)
501503
if err != nil {
502-
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err.Error())
503-
gceCS.errorBackoff.next(backoffId)
504+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
505+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
504506
} else {
505507
klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId)
506508
gceCS.errorBackoff.reset(backoffId)
@@ -654,12 +656,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
654656
// Only valid requests will be queued
655657
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
656658
if gceCS.errorBackoff.blocking(backoffId) {
657-
return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
659+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
658660
}
659661
resp, err, diskTypeForMetric := gceCS.executeControllerUnpublishVolume(ctx, req)
660662
if err != nil {
661-
klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId)
662-
gceCS.errorBackoff.next(backoffId)
663+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
664+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
663665
} else {
664666
klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId)
665667
gceCS.errorBackoff.reset(backoffId)
@@ -1838,7 +1840,7 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) {
18381840
}
18391841

18401842
func newCsiErrorBackoff(initialDuration, errorBackoffMaxDuration time.Duration) *csiErrorBackoff {
1841-
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration)}
1843+
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration), make(map[csiErrorBackoffId]codes.Code)}
18421844
}
18431845

18441846
func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId {
@@ -1850,10 +1852,22 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool {
18501852
return blk
18511853
}
18521854

1853-
func (b *csiErrorBackoff) next(id csiErrorBackoffId) {
1855+
func (b *csiErrorBackoff) code(id csiErrorBackoffId) codes.Code {
1856+
if code, ok := b.errorCodes[id]; ok {
1857+
return code
1858+
}
1859+
// If we haven't recorded a code, return unavailable, which signals a problem with the driver
1860+
// (ie, next() wasn't called correctly).
1861+
klog.Errorf("using default code for %s", id)
1862+
return codes.Unavailable
1863+
}
1864+
1865+
func (b *csiErrorBackoff) next(id csiErrorBackoffId, code codes.Code) {
18541866
b.backoff.Next(string(id), b.backoff.Clock.Now())
1867+
b.errorCodes[id] = code
18551868
}
18561869

18571870
func (b *csiErrorBackoff) reset(id csiErrorBackoffId) {
18581871
b.backoff.Reset(string(id))
1872+
delete(b.errorCodes, id)
18591873
}

0 commit comments

Comments
 (0)