Skip to content

[release-1.10] Use original error code when backing off #1311

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 37 additions & 48 deletions pkg/common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ var (
// Full or partial URL of the machine type resource, in the format:
// zones/zone/machineTypes/machine-type
machineTypeRegex = regexp.MustCompile(machineTypePattern)

// userErrorCodeMap tells how API error types are translated to error codes.
userErrorCodeMap = map[int]codes.Code{
http.StatusForbidden: codes.PermissionDenied,
http.StatusBadRequest: codes.InvalidArgument,
http.StatusTooManyRequests: codes.ResourceExhausted,
http.StatusNotFound: codes.NotFound,
}
)

func BytesToGbRoundDown(bytes int64) int64 {
Expand Down Expand Up @@ -318,82 +326,63 @@ func ParseMachineType(machineTypeUrl string) (string, error) {
return machineType[1], nil
}

// CodeForError returns a pointer to the grpc error code that maps to the http
// error code for the passed in user googleapi error or context error. Returns
// codes.Internal if the given error is not a googleapi error caused by the user.
// The following http error codes are considered user errors:
// (1) http 400 Bad Request, returns grpc InvalidArgument,
// (2) http 403 Forbidden, returns grpc PermissionDenied,
// (3) http 404 Not Found, returns grpc NotFound
// (4) http 429 Too Many Requests, returns grpc ResourceExhausted
// The following errors are considered context errors:
// (1) "context deadline exceeded", returns grpc DeadlineExceeded,
// (2) "context canceled", returns grpc Canceled
func CodeForError(err error) *codes.Code {
if err == nil {
return nil
// CodeForError returns the grpc error code that maps to the http error code for the
// passed in user googleapi error or context error. Returns codes.Internal if the given
// error is not a googleapi error caused by the user. userErrorCodeMap is used for
// encoding most errors.
func CodeForError(sourceError error) codes.Code {
if sourceError == nil {
return codes.Internal
}

if errCode := existingErrorCode(err); errCode != nil {
return errCode
if code, err := existingErrorCode(sourceError); err == nil {
return code
}
if code := isContextError(err); code != nil {
if code, err := isContextError(sourceError); err == nil {
return code
}

internalErrorCode := codes.Internal
// Upwrap the error
var apiErr *googleapi.Error
if !errors.As(err, &apiErr) {
return &internalErrorCode
}

userErrors := map[int]codes.Code{
http.StatusForbidden: codes.PermissionDenied,
http.StatusBadRequest: codes.InvalidArgument,
http.StatusTooManyRequests: codes.ResourceExhausted,
http.StatusNotFound: codes.NotFound,
if !errors.As(sourceError, &apiErr) {
return codes.Internal
}
if code, ok := userErrors[apiErr.Code]; ok {
return &code
if code, ok := userErrorCodeMap[apiErr.Code]; ok {
return code
}

return &internalErrorCode
return codes.Internal
}

// isContextError returns a pointer to the grpc error code DeadlineExceeded
// if the passed in error contains the "context deadline exceeded" string and returns
// the grpc error code Canceled if the error contains the "context canceled" string.
func isContextError(err error) *codes.Code {
// isContextError returns the grpc error code DeadlineExceeded if the passed in error
// contains the "context deadline exceeded" string and returns the grpc error code
// Canceled if the error contains the "context canceled" string. It returns and error if
// err isn't a context error.
func isContextError(err error) (codes.Code, error) {
if err == nil {
return nil
return codes.Unknown, fmt.Errorf("null error")
}

errStr := err.Error()
if strings.Contains(errStr, context.DeadlineExceeded.Error()) {
return errCodePtr(codes.DeadlineExceeded)
return codes.DeadlineExceeded, nil
}
if strings.Contains(errStr, context.Canceled.Error()) {
return errCodePtr(codes.Canceled)
return codes.Canceled, nil
}
return nil
return codes.Unknown, fmt.Errorf("Not a context error: %w", err)
}

func existingErrorCode(err error) *codes.Code {
func existingErrorCode(err error) (codes.Code, error) {
if err == nil {
return nil
return codes.Unknown, fmt.Errorf("null error")
}
if status, ok := status.FromError(err); ok {
return errCodePtr(status.Code())
return status.Code(), nil
}
return nil
}

func errCodePtr(code codes.Code) *codes.Code {
return &code
return codes.Unknown, fmt.Errorf("no existing error code for %w", err)
}

func LoggedError(msg string, err error) error {
klog.Errorf(msg+"%v", err.Error())
return status.Errorf(*CodeForError(err), msg+"%v", err.Error())
return status.Errorf(CodeForError(err), msg+"%v", err.Error())
}
58 changes: 27 additions & 31 deletions pkg/common/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -975,57 +975,51 @@ func TestParseMachineType(t *testing.T) {
}

func TestCodeForError(t *testing.T) {
internalErrorCode := codes.Internal
userErrorCode := codes.InvalidArgument
testCases := []struct {
name string
inputErr error
expCode *codes.Code
expCode codes.Code
}{
{
name: "Not googleapi.Error",
inputErr: errors.New("I am not a googleapi.Error"),
expCode: &internalErrorCode,
expCode: codes.Internal,
},
{
name: "User error",
inputErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
expCode: &userErrorCode,
expCode: codes.InvalidArgument,
},
{
name: "googleapi.Error but not a user error",
inputErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
expCode: &internalErrorCode,
expCode: codes.Internal,
},
{
name: "context canceled error",
inputErr: context.Canceled,
expCode: errCodePtr(codes.Canceled),
expCode: codes.Canceled,
},
{
name: "context deadline exceeded error",
inputErr: context.DeadlineExceeded,
expCode: errCodePtr(codes.DeadlineExceeded),
expCode: codes.DeadlineExceeded,
},
{
name: "status error with Aborted error code",
inputErr: status.Error(codes.Aborted, "aborted error"),
expCode: errCodePtr(codes.Aborted),
expCode: codes.Aborted,
},
{
name: "nil error",
inputErr: nil,
expCode: nil,
expCode: codes.Internal,
},
}

for _, tc := range testCases {
t.Logf("Running test: %v", tc.name)
errCode := CodeForError(tc.inputErr)
if (tc.expCode == nil) != (errCode == nil) {
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
}
if tc.expCode != nil && *errCode != *tc.expCode {
if errCode != tc.expCode {
t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode)
}
}
Expand All @@ -1035,46 +1029,48 @@ func TestIsContextError(t *testing.T) {
cases := []struct {
name string
err error
expectedErrCode *codes.Code
expectedErrCode codes.Code
expectError bool
}{
{
name: "deadline exceeded error",
err: context.DeadlineExceeded,
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
expectedErrCode: codes.DeadlineExceeded,
},
{
name: "contains 'context deadline exceeded'",
err: fmt.Errorf("got error: %w", context.DeadlineExceeded),
expectedErrCode: errCodePtr(codes.DeadlineExceeded),
expectedErrCode: codes.DeadlineExceeded,
},
{
name: "context canceled error",
err: context.Canceled,
expectedErrCode: errCodePtr(codes.Canceled),
expectedErrCode: codes.Canceled,
},
{
name: "contains 'context canceled'",
err: fmt.Errorf("got error: %w", context.Canceled),
expectedErrCode: errCodePtr(codes.Canceled),
expectedErrCode: codes.Canceled,
},
{
name: "does not contain 'context canceled' or 'context deadline exceeded'",
err: fmt.Errorf("unknown error"),
expectedErrCode: nil,
name: "does not contain 'context canceled' or 'context deadline exceeded'",
err: fmt.Errorf("unknown error"),
expectError: true,
},
{
name: "nil error",
err: nil,
expectedErrCode: nil,
name: "nil error",
err: nil,
expectError: true,
},
}

for _, test := range cases {
errCode := isContextError(test.err)
if (test.expectedErrCode == nil) != (errCode == nil) {
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
}
if test.expectedErrCode != nil && *errCode != *test.expectedErrCode {
errCode, err := isContextError(test.err)
if test.expectError {
if err == nil {
t.Errorf("test %v failed, expected error, got %v", test.name, errCode)
}
} else if errCode != test.expectedErrCode {
t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode)
}
}
Expand Down
34 changes: 24 additions & 10 deletions pkg/gce-pd-csi-driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,12 @@ type GCEControllerServer struct {
errorBackoff *csiErrorBackoff
}

type csiErrorBackoffId string

type csiErrorBackoff struct {
backoff *flowcontrol.Backoff
backoff *flowcontrol.Backoff
errorCodes map[csiErrorBackoffId]codes.Code
}
type csiErrorBackoffId string

type workItem struct {
ctx context.Context
Expand Down Expand Up @@ -504,13 +506,13 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r

backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
if gceCS.errorBackoff.blocking(backoffId) {
return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId)
}

resp, err, diskTypeForMetric := gceCS.executeControllerPublishVolume(ctx, req)
if err != nil {
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err.Error())
gceCS.errorBackoff.next(backoffId)
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
} else {
klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId)
gceCS.errorBackoff.reset(backoffId)
Expand Down Expand Up @@ -669,12 +671,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
// Only valid requests will be queued
backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId)
if gceCS.errorBackoff.blocking(backoffId) {
return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId)
}
resp, err, diskTypeForMetric := gceCS.executeControllerUnpublishVolume(ctx, req)
if err != nil {
klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId)
gceCS.errorBackoff.next(backoffId)
klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err)
gceCS.errorBackoff.next(backoffId, common.CodeForError(err))
} else {
klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId)
gceCS.errorBackoff.reset(backoffId)
Expand Down Expand Up @@ -1876,7 +1878,7 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) {
}

func newCsiErrorBackoff(initialDuration, errorBackoffMaxDuration time.Duration) *csiErrorBackoff {
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration)}
return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration), make(map[csiErrorBackoffId]codes.Code)}
}

func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId {
Expand All @@ -1888,10 +1890,22 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool {
return blk
}

func (b *csiErrorBackoff) next(id csiErrorBackoffId) {
func (b *csiErrorBackoff) code(id csiErrorBackoffId) codes.Code {
if code, ok := b.errorCodes[id]; ok {
return code
}
// If we haven't recorded a code, return unavailable, which signals a problem with the driver
// (ie, next() wasn't called correctly).
klog.Errorf("using default code for %s", id)
return codes.Unavailable
}

func (b *csiErrorBackoff) next(id csiErrorBackoffId, code codes.Code) {
b.backoff.Next(string(id), b.backoff.Clock.Now())
b.errorCodes[id] = code
}

func (b *csiErrorBackoff) reset(id csiErrorBackoffId) {
b.backoff.Reset(string(id))
delete(b.errorCodes, id)
}
Loading