From 7178fdbf509672fb1a935b47b3b600bcd4f21b83 Mon Sep 17 00:00:00 2001 From: Matthew Cary Date: Tue, 18 Jul 2023 14:26:57 -0700 Subject: [PATCH] Use original error code when backing off Change-Id: If514a26bf513fc47de6e74615f352a83268f5ecf --- pkg/common/utils.go | 85 +++++++++++------------- pkg/common/utils_test.go | 58 ++++++++-------- pkg/gce-pd-csi-driver/controller.go | 34 +++++++--- pkg/gce-pd-csi-driver/controller_test.go | 48 +++++++++---- 4 files changed, 123 insertions(+), 102 deletions(-) diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 26ecb5a8d..1fd22cbab 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -79,6 +79,14 @@ var ( // Full or partial URL of the machine type resource, in the format: // zones/zone/machineTypes/machine-type machineTypeRegex = regexp.MustCompile(machineTypePattern) + + // userErrorCodeMap tells how API error types are translated to error codes. + userErrorCodeMap = map[int]codes.Code{ + http.StatusForbidden: codes.PermissionDenied, + http.StatusBadRequest: codes.InvalidArgument, + http.StatusTooManyRequests: codes.ResourceExhausted, + http.StatusNotFound: codes.NotFound, + } ) func BytesToGbRoundDown(bytes int64) int64 { @@ -318,82 +326,63 @@ func ParseMachineType(machineTypeUrl string) (string, error) { return machineType[1], nil } -// CodeForError returns a pointer to the grpc error code that maps to the http -// error code for the passed in user googleapi error or context error. Returns -// codes.Internal if the given error is not a googleapi error caused by the user. -// The following http error codes are considered user errors: -// (1) http 400 Bad Request, returns grpc InvalidArgument, -// (2) http 403 Forbidden, returns grpc PermissionDenied, -// (3) http 404 Not Found, returns grpc NotFound -// (4) http 429 Too Many Requests, returns grpc ResourceExhausted -// The following errors are considered context errors: -// (1) "context deadline exceeded", returns grpc DeadlineExceeded, -// (2) "context canceled", returns grpc Canceled -func CodeForError(err error) *codes.Code { - if err == nil { - return nil +// CodeForError returns the grpc error code that maps to the http error code for the +// passed in user googleapi error or context error. Returns codes.Internal if the given +// error is not a googleapi error caused by the user. userErrorCodeMap is used for +// encoding most errors. +func CodeForError(sourceError error) codes.Code { + if sourceError == nil { + return codes.Internal } - if errCode := existingErrorCode(err); errCode != nil { - return errCode + if code, err := existingErrorCode(sourceError); err == nil { + return code } - if code := isContextError(err); code != nil { + if code, err := isContextError(sourceError); err == nil { return code } - internalErrorCode := codes.Internal - // Upwrap the error var apiErr *googleapi.Error - if !errors.As(err, &apiErr) { - return &internalErrorCode - } - - userErrors := map[int]codes.Code{ - http.StatusForbidden: codes.PermissionDenied, - http.StatusBadRequest: codes.InvalidArgument, - http.StatusTooManyRequests: codes.ResourceExhausted, - http.StatusNotFound: codes.NotFound, + if !errors.As(sourceError, &apiErr) { + return codes.Internal } - if code, ok := userErrors[apiErr.Code]; ok { - return &code + if code, ok := userErrorCodeMap[apiErr.Code]; ok { + return code } - return &internalErrorCode + return codes.Internal } -// isContextError returns a pointer to the grpc error code DeadlineExceeded -// if the passed in error contains the "context deadline exceeded" string and returns -// the grpc error code Canceled if the error contains the "context canceled" string. -func isContextError(err error) *codes.Code { +// isContextError returns the grpc error code DeadlineExceeded if the passed in error +// contains the "context deadline exceeded" string and returns the grpc error code +// Canceled if the error contains the "context canceled" string. It returns and error if +// err isn't a context error. +func isContextError(err error) (codes.Code, error) { if err == nil { - return nil + return codes.Unknown, fmt.Errorf("null error") } errStr := err.Error() if strings.Contains(errStr, context.DeadlineExceeded.Error()) { - return errCodePtr(codes.DeadlineExceeded) + return codes.DeadlineExceeded, nil } if strings.Contains(errStr, context.Canceled.Error()) { - return errCodePtr(codes.Canceled) + return codes.Canceled, nil } - return nil + return codes.Unknown, fmt.Errorf("Not a context error: %w", err) } -func existingErrorCode(err error) *codes.Code { +func existingErrorCode(err error) (codes.Code, error) { if err == nil { - return nil + return codes.Unknown, fmt.Errorf("null error") } if status, ok := status.FromError(err); ok { - return errCodePtr(status.Code()) + return status.Code(), nil } - return nil -} - -func errCodePtr(code codes.Code) *codes.Code { - return &code + return codes.Unknown, fmt.Errorf("no existing error code for %w", err) } func LoggedError(msg string, err error) error { klog.Errorf(msg+"%v", err.Error()) - return status.Errorf(*CodeForError(err), msg+"%v", err.Error()) + return status.Errorf(CodeForError(err), msg+"%v", err.Error()) } diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index 05ddc3565..812fb25e7 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -975,57 +975,51 @@ func TestParseMachineType(t *testing.T) { } func TestCodeForError(t *testing.T) { - internalErrorCode := codes.Internal - userErrorCode := codes.InvalidArgument testCases := []struct { name string inputErr error - expCode *codes.Code + expCode codes.Code }{ { name: "Not googleapi.Error", inputErr: errors.New("I am not a googleapi.Error"), - expCode: &internalErrorCode, + expCode: codes.Internal, }, { name: "User error", inputErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}, - expCode: &userErrorCode, + expCode: codes.InvalidArgument, }, { name: "googleapi.Error but not a user error", inputErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"}, - expCode: &internalErrorCode, + expCode: codes.Internal, }, { name: "context canceled error", inputErr: context.Canceled, - expCode: errCodePtr(codes.Canceled), + expCode: codes.Canceled, }, { name: "context deadline exceeded error", inputErr: context.DeadlineExceeded, - expCode: errCodePtr(codes.DeadlineExceeded), + expCode: codes.DeadlineExceeded, }, { name: "status error with Aborted error code", inputErr: status.Error(codes.Aborted, "aborted error"), - expCode: errCodePtr(codes.Aborted), + expCode: codes.Aborted, }, { name: "nil error", inputErr: nil, - expCode: nil, + expCode: codes.Internal, }, } for _, tc := range testCases { - t.Logf("Running test: %v", tc.name) errCode := CodeForError(tc.inputErr) - if (tc.expCode == nil) != (errCode == nil) { - t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode) - } - if tc.expCode != nil && *errCode != *tc.expCode { + if errCode != tc.expCode { t.Errorf("test %v failed: got %v, expected %v", tc.name, errCode, tc.expCode) } } @@ -1035,46 +1029,48 @@ func TestIsContextError(t *testing.T) { cases := []struct { name string err error - expectedErrCode *codes.Code + expectedErrCode codes.Code + expectError bool }{ { name: "deadline exceeded error", err: context.DeadlineExceeded, - expectedErrCode: errCodePtr(codes.DeadlineExceeded), + expectedErrCode: codes.DeadlineExceeded, }, { name: "contains 'context deadline exceeded'", err: fmt.Errorf("got error: %w", context.DeadlineExceeded), - expectedErrCode: errCodePtr(codes.DeadlineExceeded), + expectedErrCode: codes.DeadlineExceeded, }, { name: "context canceled error", err: context.Canceled, - expectedErrCode: errCodePtr(codes.Canceled), + expectedErrCode: codes.Canceled, }, { name: "contains 'context canceled'", err: fmt.Errorf("got error: %w", context.Canceled), - expectedErrCode: errCodePtr(codes.Canceled), + expectedErrCode: codes.Canceled, }, { - name: "does not contain 'context canceled' or 'context deadline exceeded'", - err: fmt.Errorf("unknown error"), - expectedErrCode: nil, + name: "does not contain 'context canceled' or 'context deadline exceeded'", + err: fmt.Errorf("unknown error"), + expectError: true, }, { - name: "nil error", - err: nil, - expectedErrCode: nil, + name: "nil error", + err: nil, + expectError: true, }, } for _, test := range cases { - errCode := isContextError(test.err) - if (test.expectedErrCode == nil) != (errCode == nil) { - t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode) - } - if test.expectedErrCode != nil && *errCode != *test.expectedErrCode { + errCode, err := isContextError(test.err) + if test.expectError { + if err == nil { + t.Errorf("test %v failed, expected error, got %v", test.name, errCode) + } + } else if errCode != test.expectedErrCode { t.Errorf("test %v failed: got %v, expected %v", test.name, errCode, test.expectedErrCode) } } diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go index d27937b0b..ffde2adb6 100644 --- a/pkg/gce-pd-csi-driver/controller.go +++ b/pkg/gce-pd-csi-driver/controller.go @@ -92,10 +92,12 @@ type GCEControllerServer struct { errorBackoff *csiErrorBackoff } +type csiErrorBackoffId string + type csiErrorBackoff struct { - backoff *flowcontrol.Backoff + backoff *flowcontrol.Backoff + errorCodes map[csiErrorBackoffId]codes.Code } -type csiErrorBackoffId string type workItem struct { ctx context.Context @@ -496,13 +498,13 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId) if gceCS.errorBackoff.blocking(backoffId) { - return nil, status.Errorf(codes.Unavailable, "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId) + return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerPublish not permitted on node %q due to backoff condition", req.NodeId) } resp, err, diskTypeForMetric := gceCS.executeControllerPublishVolume(ctx, req) if err != nil { - klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err.Error()) - gceCS.errorBackoff.next(backoffId) + klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err) + gceCS.errorBackoff.next(backoffId, common.CodeForError(err)) } else { klog.Infof("For node %s clear backoff due to successful publish of volume %v", req.NodeId, req.VolumeId) gceCS.errorBackoff.reset(backoffId) @@ -661,12 +663,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context, // Only valid requests will be queued backoffId := gceCS.errorBackoff.backoffId(req.NodeId, req.VolumeId) if gceCS.errorBackoff.blocking(backoffId) { - return nil, status.Errorf(codes.Unavailable, "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId) + return nil, status.Errorf(gceCS.errorBackoff.code(backoffId), "ControllerUnpublish not permitted on node %q due to backoff condition", req.NodeId) } resp, err, diskTypeForMetric := gceCS.executeControllerUnpublishVolume(ctx, req) if err != nil { - klog.Infof("For node %s adding backoff due to error for volume %s", req.NodeId, req.VolumeId) - gceCS.errorBackoff.next(backoffId) + klog.Infof("For node %s adding backoff due to error for volume %s: %v", req.NodeId, req.VolumeId, err) + gceCS.errorBackoff.next(backoffId, common.CodeForError(err)) } else { klog.Infof("For node %s clear backoff due to successful unpublish of volume %v", req.NodeId, req.VolumeId) gceCS.errorBackoff.reset(backoffId) @@ -1816,7 +1818,7 @@ func pickRandAndConsecutive(slice []string, n int) ([]string, error) { } func newCsiErrorBackoff(initialDuration, errorBackoffMaxDuration time.Duration) *csiErrorBackoff { - return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration)} + return &csiErrorBackoff{flowcontrol.NewBackOff(initialDuration, errorBackoffMaxDuration), make(map[csiErrorBackoffId]codes.Code)} } func (_ *csiErrorBackoff) backoffId(nodeId, volumeId string) csiErrorBackoffId { @@ -1828,10 +1830,22 @@ func (b *csiErrorBackoff) blocking(id csiErrorBackoffId) bool { return blk } -func (b *csiErrorBackoff) next(id csiErrorBackoffId) { +func (b *csiErrorBackoff) code(id csiErrorBackoffId) codes.Code { + if code, ok := b.errorCodes[id]; ok { + return code + } + // If we haven't recorded a code, return unavailable, which signals a problem with the driver + // (ie, next() wasn't called correctly). + klog.Errorf("using default code for %s", id) + return codes.Unavailable +} + +func (b *csiErrorBackoff) next(id csiErrorBackoffId, code codes.Code) { b.backoff.Next(string(id), b.backoff.Clock.Now()) + b.errorCodes[id] = code } func (b *csiErrorBackoff) reset(id csiErrorBackoffId) { b.backoff.Reset(string(id)) + delete(b.errorCodes, id) } diff --git a/pkg/gce-pd-csi-driver/controller_test.go b/pkg/gce-pd-csi-driver/controller_test.go index d97815f10..4e0e6bb6f 100644 --- a/pkg/gce-pd-csi-driver/controller_test.go +++ b/pkg/gce-pd-csi-driver/controller_test.go @@ -257,7 +257,6 @@ func TestDeleteSnapshot(t *testing.T) { _, err := gceDriver.cs.DeleteSnapshot(context.Background(), tc.req) if err != nil { serverError, ok := status.FromError(err) - t.Logf("get server error %v", serverError) if !ok { t.Fatalf("Could not get error status code from err: %v", serverError) } @@ -2970,7 +2969,7 @@ type backoffDriverConfig struct { func newFakeCSIErrorBackoff(tc *clock.FakeClock) *csiErrorBackoff { backoff := flowcontrol.NewFakeBackOff(errorBackoffInitialDuration, errorBackoffMaxDuration, tc) - return &csiErrorBackoff{backoff} + return &csiErrorBackoff{backoff, make(map[csiErrorBackoffId]codes.Code)} } func TestControllerUnpublishBackoff(t *testing.T) { @@ -3010,7 +3009,7 @@ func TestControllerUnpublishBackoff(t *testing.T) { } // Mock an active backoff condition on the node. - driver.cs.errorBackoff.next(backoffId) + driver.cs.errorBackoff.next(backoffId, codes.Unavailable) tc.config.clock.Step(step) // A requst for a a different volume should succeed. This volume is not @@ -3028,7 +3027,9 @@ func TestControllerUnpublishBackoff(t *testing.T) { VolumeId: testVolumeID, NodeId: testNodeID, } - // For the first 199 ms, the backoff condition is true. All controller publish request will be denied with 'Unavailable' error code. + // For the first 199 ms, the backoff condition is true. All controller publish + // request will be denied with the same unavailable error code as was set on + // the original error. for i := 0; i < 199; i++ { var err error _, err = driver.cs.ControllerUnpublishVolume(context.Background(), unpubreq) @@ -3052,18 +3053,23 @@ func TestControllerUnpublishBackoff(t *testing.T) { return } - // Mock an error + // Mock an error. This will produce an Internal error, which is different from + // the default error and what's used in the failure above, so that the correct + // error code can be confirmed. if err := runUnpublishRequest(unpubreq, true); err == nil { t.Errorf("expected error") } - // The above failure should cause driver to call Backoff.Next() again and a backoff duration of 400 ms duration is set starting at the 200th millisecond. - // For the 200-599 ms, the backoff condition is true, and new controller publish requests will be deined. + // The above failure should cause driver to call backoff.next() again and a + // backoff duration of 400 ms duration is set starting at the 200th + // millisecond. For the 200-599 ms, the backoff condition is true, with an + // internal error this time, and new controller publish requests will be + // denied. for i := 0; i < 399; i++ { tc.config.clock.Step(step) var err error _, err = driver.cs.ControllerUnpublishVolume(context.Background(), unpubreq) - if !isUnavailableError(err) { + if !isInternalError(err) { t.Errorf("unexpected error %v", err) } } @@ -3139,8 +3145,8 @@ func TestControllerPublishBackoff(t *testing.T) { backoffId := driver.cs.errorBackoff.backoffId(testNodeID, testVolumeID) step := 1 * time.Millisecond - // Mock an active backoff condition on the node. - driver.cs.errorBackoff.next(backoffId) + // Mock an active bakcoff condition on the node. + driver.cs.errorBackoff.next(backoffId, codes.Unavailable) // A detach request for a different disk should succeed. As this disk is not // on the instance, the detach will succeed without calling the gce detach @@ -3213,13 +3219,16 @@ func TestControllerPublishBackoff(t *testing.T) { t.Errorf("expected error") } - // The above failure should cause driver to call Backoff.Next() again and a backoff duration of 400 ms duration is set starting at the 200th millisecond. - // For the 200-599 ms, the backoff condition is true, and new controller publish requests will be deined. + // The above failure should cause driver to call backoff.next() again and a + // backoff duration of 400 ms duration is set starting at the 200th + // millisecond. For the 200-599 ms, the backoff condition is true, with an + // internal error this time, and new controller publish requests will be + // denied. for i := 0; i < 399; i++ { tc.config.clock.Step(step) var err error _, err = driver.cs.ControllerPublishVolume(context.Background(), pubreq) - if !isUnavailableError(err) { + if !isInternalError(err) { t.Errorf("unexpected error %v", err) } } @@ -3363,3 +3372,16 @@ func isUnavailableError(err error) bool { return st.Code().String() == "Unavailable" } + +func isInternalError(err error) bool { + if err == nil { + return false + } + + st, ok := status.FromError(err) + if !ok { + return false + } + + return st.Code().String() == "Internal" +}