Skip to content

Commit 754d096

Browse files
authored
Merge pull request #1327 from judemars/automated-cherry-pick-of-#1298-upstream-release-1.8
[release-1.8] Automated cherry pick of #1298: Use original error code when backing off
2 parents 5af16b9 + 624fcac commit 754d096

File tree

4 files changed

+123
-101
lines changed

4 files changed

+123
-101
lines changed

pkg/common/utils.go

+37-48
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ var (
7979
// Full or partial URL of the machine type resource, in the format:
8080
// zones/zone/machineTypes/machine-type
8181
machineTypeRegex = regexp.MustCompile(machineTypePattern)
82+
83+
// userErrorCodeMap tells how API error types are translated to error codes.
84+
userErrorCodeMap = map[int]codes.Code{
85+
http.StatusForbidden: codes.PermissionDenied,
86+
http.StatusBadRequest: codes.InvalidArgument,
87+
http.StatusTooManyRequests: codes.ResourceExhausted,
88+
http.StatusNotFound: codes.NotFound,
89+
}
8290
)
8391

8492
func BytesToGbRoundDown(bytes int64) int64 {
@@ -296,82 +304,63 @@ func ParseMachineType(machineTypeUrl string) (string, error) {
296304
return machineType[1], nil
297305
}
298306

299-
// CodeForError returns a pointer to the grpc error code that maps to the http
300-
// error code for the passed in user googleapi error or context error. Returns
301-
// codes.Internal if the given error is not a googleapi error caused by the user.
302-
// The following http error codes are considered user errors:
303-
// (1) http 400 Bad Request, returns grpc InvalidArgument,
304-
// (2) http 403 Forbidden, returns grpc PermissionDenied,
305-
// (3) http 404 Not Found, returns grpc NotFound
306-
// (4) http 429 Too Many Requests, returns grpc ResourceExhausted
307-
// The following errors are considered context errors:
308-
// (1) "context deadline exceeded", returns grpc DeadlineExceeded,
309-
// (2) "context canceled", returns grpc Canceled
310-
func CodeForError(err error) *codes.Code {
311-
if err == nil {
312-
return nil
307+
// CodeForError returns the grpc error code that maps to the http error code for the
308+
// passed in user googleapi error or context error. Returns codes.Internal if the given
309+
// error is not a googleapi error caused by the user. userErrorCodeMap is used for
310+
// encoding most errors.
311+
func CodeForError(sourceError error) codes.Code {
312+
if sourceError == nil {
313+
return codes.Internal
313314
}
314315

315-
if errCode := existingErrorCode(err); errCode != nil {
316-
return errCode
316+
if code, err := existingErrorCode(sourceError); err == nil {
317+
return code
317318
}
318-
if code := isContextError(err); code != nil {
319+
if code, err := isContextError(sourceError); err == nil {
319320
return code
320321
}
321322

322-
internalErrorCode := codes.Internal
323-
// Upwrap the error
324323
var apiErr *googleapi.Error
325-
if !errors.As(err, &apiErr) {
326-
return &internalErrorCode
327-
}
328-
329-
userErrors := map[int]codes.Code{
330-
http.StatusForbidden: codes.PermissionDenied,
331-
http.StatusBadRequest: codes.InvalidArgument,
332-
http.StatusTooManyRequests: codes.ResourceExhausted,
333-
http.StatusNotFound: codes.NotFound,
324+
if !errors.As(sourceError, &apiErr) {
325+
return codes.Internal
334326
}
335-
if code, ok := userErrors[apiErr.Code]; ok {
336-
return &code
327+
if code, ok := userErrorCodeMap[apiErr.Code]; ok {
328+
return code
337329
}
338330

339-
return &internalErrorCode
331+
return codes.Internal
340332
}
341333

342-
// isContextError returns a pointer to the grpc error code DeadlineExceeded
343-
// if the passed in error contains the "context deadline exceeded" string and returns
344-
// the grpc error code Canceled if the error contains the "context canceled" string.
345-
func isContextError(err error) *codes.Code {
334+
// isContextError returns the grpc error code DeadlineExceeded if the passed in error
335+
// contains the "context deadline exceeded" string and returns the grpc error code
336+
// Canceled if the error contains the "context canceled" string. It returns and error if
337+
// err isn't a context error.
338+
func isContextError(err error) (codes.Code, error) {
346339
if err == nil {
347-
return nil
340+
return codes.Unknown, fmt.Errorf("null error")
348341
}
349342

350343
errStr := err.Error()
351344
if strings.Contains(errStr, context.DeadlineExceeded.Error()) {
352-
return errCodePtr(codes.DeadlineExceeded)
345+
return codes.DeadlineExceeded, nil
353346
}
354347
if strings.Contains(errStr, context.Canceled.Error()) {
355-
return errCodePtr(codes.Canceled)
348+
return codes.Canceled, nil
356349
}
357-
return nil
350+
return codes.Unknown, fmt.Errorf("Not a context error: %w", err)
358351
}
359352

360-
func existingErrorCode(err error) *codes.Code {
353+
func existingErrorCode(err error) (codes.Code, error) {
361354
if err == nil {
362-
return nil
355+
return codes.Unknown, fmt.Errorf("null error")
363356
}
364357
if status, ok := status.FromError(err); ok {
365-
return errCodePtr(status.Code())
358+
return status.Code(), nil
366359
}
367-
return nil
368-
}
369-
370-
func errCodePtr(code codes.Code) *codes.Code {
371-
return &code
360+
return codes.Unknown, fmt.Errorf("no existing error code for %w", err)
372361
}
373362

374363
func LoggedError(msg string, err error) error {
375364
klog.Errorf(msg+"%v", err.Error())
376-
return status.Errorf(*CodeForError(err), msg+"%v", err.Error())
365+
return status.Errorf(CodeForError(err), msg+"%v", err.Error())
377366
}

pkg/common/utils_test.go

+27-31
Original file line numberDiff line numberDiff line change
@@ -867,57 +867,51 @@ func TestParseMachineType(t *testing.T) {
867867
}
868868

869869
func TestCodeForError(t *testing.T) {
870-
internalErrorCode := codes.Internal
871-
userErrorCode := codes.InvalidArgument
872870
testCases := []struct {
873871
name string
874872
inputErr error
875-
expCode *codes.Code
873+
expCode codes.Code
876874
}{
877875
{
878876
name: "Not googleapi.Error",
879877
inputErr: errors.New("I am not a googleapi.Error"),
880-
expCode: &internalErrorCode,
878+
expCode: codes.Internal,
881879
},
882880
{
883881
name: "User error",
884882
inputErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
885-
expCode: &userErrorCode,
883+
expCode: codes.InvalidArgument,
886884
},
887885
{
888886
name: "googleapi.Error but not a user error",
889887
inputErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
890-
expCode: &internalErrorCode,
888+
expCode: codes.Internal,
891889
},
892890
{
893891
name: "context canceled error",
894892
inputErr: context.Canceled,
895-
expCode: errCodePtr(codes.Canceled),
893+
expCode: codes.Canceled,
896894
},
897895
{
898896
name: "context deadline exceeded error",
899897
inputErr: context.DeadlineExceeded,
900-
expCode: errCodePtr(codes.DeadlineExceeded),
898+
expCode: codes.DeadlineExceeded,
901899
},
902900
{
903901
name: "status error with Aborted error code",
904902
inputErr: status.Error(codes.Aborted, "aborted error"),
905-
expCode: errCodePtr(codes.Aborted),
903+
expCode: codes.Aborted,
906904
},
907905
{
908906
name: "nil error",
909907
inputErr: nil,
910-
expCode: nil,
908+
expCode: codes.Internal,
911909
},
912910
}
913911

914912
for _, tc := range testCases {
915-
t.Logf("Running test: %v", tc.name)
916913
errCode := CodeForError(tc.inputErr)
917-
if (tc.expCode == nil) != (errCode == nil) {
918-
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
919-
}
920-
if tc.expCode != nil && *errCode != *tc.expCode {
914+
if errCode != tc.expCode {
921915
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
922916
}
923917
}
@@ -927,46 +921,48 @@ func TestIsContextError(t *testing.T) {
927921
cases := []struct {
928922
name string
929923
err error
930-
expectedErrCode *codes.Code
924+
expectedErrCode codes.Code
925+
expectError bool
931926
}{
932927
{
933928
name: "deadline exceeded error",
934929
err: context.DeadlineExceeded,
935-
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
930+
expectedErrCode: codes.DeadlineExceeded,
936931
},
937932
{
938933
name: "contains 'context deadline exceeded'",
939934
err: fmt.Errorf("got error: %w", context.DeadlineExceeded),
940-
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
935+
expectedErrCode: codes.DeadlineExceeded,
941936
},
942937
{
943938
name: "context canceled error",
944939
err: context.Canceled,
945-
expectedErrCode: errCodePtr(codes.Canceled),
940+
expectedErrCode: codes.Canceled,
946941
},
947942
{
948943
name: "contains 'context canceled'",
949944
err: fmt.Errorf("got error: %w", context.Canceled),
950-
expectedErrCode: errCodePtr(codes.Canceled),
945+
expectedErrCode: codes.Canceled,
951946
},
952947
{
953-
name: "does not contain 'context canceled' or 'context deadline exceeded'",
954-
err: fmt.Errorf("unknown error"),
955-
expectedErrCode: nil,
948+
name: "does not contain 'context canceled' or 'context deadline exceeded'",
949+
err: fmt.Errorf("unknown error"),
950+
expectError: true,
956951
},
957952
{
958-
name: "nil error",
959-
err: nil,
960-
expectedErrCode: nil,
953+
name: "nil error",
954+
err: nil,
955+
expectError: true,
961956
},
962957
}
963958

964959
for _, test := range cases {
965-
errCode := isContextError(test.err)
966-
if (test.expectedErrCode == nil) != (errCode == nil) {
967-
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
968-
}
969-
if test.expectedErrCode != nil && *errCode != *test.expectedErrCode {
960+
errCode, err := isContextError(test.err)
961+
if test.expectError {
962+
if err == nil {
963+
t.Errorf("test %v failed, expected error, got %v", test.name, errCode)
964+
}
965+
} else if errCode != test.expectedErrCode {
970966
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
971967
}
972968
}

pkg/gce-pd-csi-driver/controller.go

+24-10
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,12 @@ type GCEControllerServer struct {
9191
errorBackoff *csiErrorBackoff
9292
}
9393

94+
type csiErrorBackoffId string
95+
9496
type csiErrorBackoff struct {
95-
backoff *flowcontrol.Backoff
97+
backoff *flowcontrol.Backoff
98+
errorCodes map[csiErrorBackoffId]codes.Code
9699
}
97-
type csiErrorBackoffId string
98100

99101
type workItem struct {
100102
ctx context.Context
@@ -482,13 +484,13 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
482484

483485
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
484486
if gceCS.errorBackoff.blocking(backoffId) {
485-
return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
487+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
486488
}
487489

488490
resp, err, diskTypeForMetric := gceCS.executeControllerPublishVolume(ctx, req)
489491
if err != nil {
490-
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err.Error())
491-
gceCS.errorBackoff.next(backoffId)
492+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
493+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
492494
} else {
493495
klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId)
494496
gceCS.errorBackoff.reset(backoffId)
@@ -642,12 +644,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
642644
// Only valid requests will be queued
643645
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
644646
if gceCS.errorBackoff.blocking(backoffId) {
645-
return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
647+
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
646648
}
647649
resp, err, diskTypeForMetric := gceCS.executeControllerUnpublishVolume(ctx, req)
648650
if err != nil {
649-
klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId)
650-
gceCS.errorBackoff.next(backoffId)
651+
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
652+
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
651653
} else {
652654
klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId)
653655
gceCS.errorBackoff.reset(backoffId)
@@ -1775,7 +1777,7 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) {
17751777
}
17761778

17771779
func newCsiErrorBackoff(initialDuration, errorBackoffMaxDuration time.Duration) *csiErrorBackoff {
1778-
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration)}
1780+
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration), make(map[csiErrorBackoffId]codes.Code)}
17791781
}
17801782

17811783
func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId {
@@ -1787,10 +1789,22 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool {
17871789
return blk
17881790
}
17891791

1790-
func (b *csiErrorBackoff) next(id csiErrorBackoffId) {
1792+
func (b *csiErrorBackoff) code(id csiErrorBackoffId) codes.Code {
1793+
if code, ok := b.errorCodes[id]; ok {
1794+
return code
1795+
}
1796+
// If we haven't recorded a code, return unavailable, which signals a problem with the driver
1797+
// (ie, next() wasn't called correctly).
1798+
klog.Errorf("using default code for %s", id)
1799+
return codes.Unavailable
1800+
}
1801+
1802+
func (b *csiErrorBackoff) next(id csiErrorBackoffId, code codes.Code) {
17911803
b.backoff.Next(string(id), b.backoff.Clock.Now())
1804+
b.errorCodes[id] = code
17921805
}
17931806

17941807
func (b *csiErrorBackoff) reset(id csiErrorBackoffId) {
17951808
b.backoff.Reset(string(id))
1809+
delete(b.errorCodes, id)
17961810
}

0 commit comments

Comments
 (0)