Skip to content

Commit 85b2f6f

Browse files
authored
Merge pull request #1672 from k8s-infra-cherrypick-robot/cherry-pick-1664-to-release-1.13
[release-1.13] Record original error code to operation_errors metric for temporary errors
2 parents cfe53f7 + e3693c6 commit 85b2f6f

File tree

6 files changed

+187
-13
lines changed

6 files changed

+187
-13
lines changed

pkg/common/errors.go

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"google.golang.org/grpc/codes"
21+
"google.golang.org/grpc/status"
22+
)
23+
24+
// TemporaryError wraps an error with a temporary error code.
25+
// It implements the error interface. Do not return TemporaryError
26+
// directly from CSI Spec API calls, as CSI Spec API calls MUST
27+
// return a standard gRPC status. If TemporaryErrors are returned from
28+
// helper functions within a CSI Spec API method, make sure the outer CSI
29+
// Spec API method returns a standard gRPC status. (e.g. LoggedError(tempErr) )
30+
type TemporaryError struct {
31+
err error
32+
code codes.Code
33+
}
34+
35+
// Unwrap extracts the original error.
36+
func (te *TemporaryError) Unwrap() error {
37+
return te.err
38+
}
39+
40+
// GRPCStatus extracts the underlying gRPC Status error.
41+
// This method is necessary to fulfill the grpcstatus interface
42+
// described in https://pkg.go.dev/google.golang.org/grpc/status#FromError.
43+
// FromError is used in CodeForError to get existing error codes from status errors.
44+
func (te *TemporaryError) GRPCStatus() *status.Status {
45+
if te.err == nil {
46+
return status.New(codes.OK, "")
47+
}
48+
return status.New(te.code, te.err.Error())
49+
}
50+
51+
func NewTemporaryError(code codes.Code, err error) *TemporaryError {
52+
return &TemporaryError{err: err, code: code}
53+
}
54+
55+
// Error returns a readable representation of the TemporaryError.
56+
func (te *TemporaryError) Error() string {
57+
return te.err.Error()
58+
}

pkg/common/utils_test.go

+20
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,26 @@ func TestCodeForError(t *testing.T) {
10211021
inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
10221022
expCode: codes.InvalidArgument,
10231023
},
1024+
{
1025+
name: "TemporaryError that wraps googleapi error",
1026+
inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}},
1027+
expCode: codes.Unavailable,
1028+
},
1029+
{
1030+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
1031+
inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})},
1032+
expCode: codes.Aborted,
1033+
},
1034+
{
1035+
name: "TemporaryError that wraps status error",
1036+
inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")},
1037+
expCode: codes.Aborted,
1038+
},
1039+
{
1040+
name: "TemporaryError that wraps context canceled error",
1041+
inputErr: &TemporaryError{code: codes.Aborted, err: context.Canceled},
1042+
expCode: codes.Aborted,
1043+
},
10241044
}
10251045

10261046
for _, tc := range testCases {

pkg/gce-cloud-provider/compute/gce-compute.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
522522
if err != nil {
523523
// failed to GetDisk, however the Disk may already exist
524524
// the error code should be non-Final
525-
return status.Error(codes.Unavailable, err.Error())
525+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
526526
}
527527
err = cloud.ValidateExistingDisk(ctx, disk, params,
528528
int64(capacityRange.GetRequiredBytes()),
@@ -546,7 +546,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
546546
if IsGCEError(err, "alreadyExists") {
547547
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
548548
if err != nil {
549-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
549+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
550550
}
551551
err = cloud.ValidateExistingDisk(ctx, disk, params,
552552
int64(capacityRange.GetRequiredBytes()),
@@ -558,7 +558,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
558558
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
559559
return nil
560560
}
561-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
561+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
562562
}
563563
return nil
564564
}
@@ -653,7 +653,7 @@ func (cloud *CloudProvider) insertZonalDisk(
653653
if err != nil {
654654
// failed to GetDisk, however the Disk may already exist
655655
// the error code should be non-Final
656-
return status.Error(codes.Unavailable, err.Error())
656+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
657657
}
658658
err = cloud.ValidateExistingDisk(ctx, disk, params,
659659
int64(capacityRange.GetRequiredBytes()),
@@ -678,7 +678,7 @@ func (cloud *CloudProvider) insertZonalDisk(
678678
if IsGCEError(err, "alreadyExists") {
679679
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
680680
if err != nil {
681-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
681+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
682682
}
683683
err = cloud.ValidateExistingDisk(ctx, disk, params,
684684
int64(capacityRange.GetRequiredBytes()),
@@ -690,7 +690,7 @@ func (cloud *CloudProvider) insertZonalDisk(
690690
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
691691
return nil
692692
}
693-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
693+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
694694
}
695695
return nil
696696
}

pkg/gce-pd-csi-driver/controller.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
20012001
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20022002
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
20032003
if err != nil {
2004-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
2004+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err))
20052005
}
20062006
return disk, nil
20072007
}
@@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
20242024
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20252025
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
20262026
if err != nil {
2027-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
2027+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err))
20282028
}
20292029
return disk, nil
20302030
}

pkg/metrics/metrics.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package metrics
1818

1919
import (
20+
"errors"
2021
"fmt"
2122
"net/http"
2223
"os"
@@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
9798
diskType string,
9899
enableConfidentialStorage string,
99100
enableStoragePools string) {
100-
err := codes.OK.String()
101-
if operationErr != nil {
102-
err = common.CodeForError(operationErr).String()
103-
}
104-
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc()
101+
errCode := errorCodeLabelValue(operationErr)
102+
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc()
103+
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
105104
}
106105

107106
func (mm *MetricsManager) EmitGKEComponentVersion() error {
@@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) {
169168
}
170169
return diskType, enableConfidentialStorage, enableStoragePools
171170
}
171+
172+
// errorCodeLabelValue returns the label value for the given operation error.
173+
// This was separated into a helper function for unit testing purposes.
174+
func errorCodeLabelValue(operationErr error) string {
175+
err := codes.OK.String()
176+
if operationErr != nil {
177+
// If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError.
178+
var tempErr *common.TemporaryError
179+
if errors.As(operationErr, &tempErr) {
180+
operationErr = tempErr.Unwrap()
181+
}
182+
err = common.CodeForError(operationErr).String()
183+
}
184+
return err
185+
}

pkg/metrics/metrics_test.go

+82
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,19 @@ limitations under the License.
1818
package metrics
1919

2020
import (
21+
"context"
22+
"errors"
23+
"fmt"
24+
"net/http"
2125
"testing"
2226

27+
"github.com/google/go-cmp/cmp"
2328
"google.golang.org/api/compute/v1"
29+
"google.golang.org/api/googleapi"
30+
"google.golang.org/grpc/codes"
31+
"google.golang.org/grpc/status"
32+
33+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2434
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
2535
)
2636

@@ -101,3 +111,75 @@ func TestGetMetricParameters(t *testing.T) {
101111
}
102112
}
103113
}
114+
115+
func TestErrorCodeLabelValue(t *testing.T) {
116+
testCases := []struct {
117+
name string
118+
operationErr error
119+
wantErrorCode string
120+
}{
121+
{
122+
name: "Not googleapi.Error",
123+
operationErr: errors.New("I am not a googleapi.Error"),
124+
wantErrorCode: "Internal",
125+
},
126+
{
127+
name: "User error",
128+
operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
129+
wantErrorCode: "InvalidArgument",
130+
},
131+
{
132+
name: "googleapi.Error but not a user error",
133+
operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
134+
wantErrorCode: "Internal",
135+
},
136+
{
137+
name: "context canceled error",
138+
operationErr: context.Canceled,
139+
wantErrorCode: "Canceled",
140+
},
141+
{
142+
name: "context deadline exceeded error",
143+
operationErr: context.DeadlineExceeded,
144+
wantErrorCode: "DeadlineExceeded",
145+
},
146+
{
147+
name: "status error with Aborted error code",
148+
operationErr: status.Error(codes.Aborted, "aborted error"),
149+
wantErrorCode: "Aborted",
150+
},
151+
{
152+
name: "user multiattach error",
153+
operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
154+
wantErrorCode: "InvalidArgument",
155+
},
156+
{
157+
name: "TemporaryError that wraps googleapi error",
158+
operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}),
159+
wantErrorCode: "InvalidArgument",
160+
},
161+
{
162+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
163+
operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})),
164+
wantErrorCode: "InvalidArgument",
165+
},
166+
{
167+
name: "TemporaryError that wraps status error",
168+
operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")),
169+
wantErrorCode: "InvalidArgument",
170+
},
171+
{
172+
name: "TemporaryError that wraps multiattach error",
173+
operationErr: common.NewTemporaryError(codes.Unavailable, fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'")),
174+
wantErrorCode: "InvalidArgument",
175+
},
176+
}
177+
178+
for _, tc := range testCases {
179+
t.Logf("Running test: %v", tc.name)
180+
errCode := errorCodeLabelValue(tc.operationErr)
181+
if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" {
182+
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
183+
}
184+
}
185+
}

0 commit comments

Comments
 (0)