Skip to content

Commit b0a7c7f

Browse files
committed
Record original error code to operation_errors metric for temporary errors
1 parent cb041f3 commit b0a7c7f

File tree

6 files changed

+187
-13
lines changed

6 files changed

+187
-13
lines changed

pkg/common/errors.go

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"google.golang.org/grpc/codes"
21+
"google.golang.org/grpc/status"
22+
)
23+
24+
// TemporaryError wraps an error with a temporary error code.
25+
// It implements the error interface. Do not return TemporaryError
26+
// directly from CSI Spec API calls, as CSI Spec API calls MUST
27+
// return a standard gRPC status. If TemporaryErrors are returned from
28+
// helper functions within a CSI Spec API method, make sure the outer CSI
29+
// Spec API method returns a standard gRPC status. (e.g. LoggedError(tempErr) )
30+
type TemporaryError struct {
31+
err error
32+
code codes.Code
33+
}
34+
35+
// Unwrap extracts the original error.
36+
func (te *TemporaryError) Unwrap() error {
37+
return te.err
38+
}
39+
40+
// GRPCStatus extracts the underlying gRPC Status error.
41+
// This method is necessary to fulfill the grpcstatus interface
42+
// described in https://pkg.go.dev/google.golang.org/grpc/status#FromError.
43+
// FromError is used in CodeForError to get existing error codes from status errors.
44+
func (te *TemporaryError) GRPCStatus() *status.Status {
45+
if te.err == nil {
46+
return status.New(codes.OK, "")
47+
}
48+
return status.New(te.code, te.err.Error())
49+
}
50+
51+
func NewTemporaryError(code codes.Code, err error) *TemporaryError {
52+
return &TemporaryError{err: err, code: code}
53+
}
54+
55+
// Error returns a readable representation of the TemporaryError.
56+
func (te *TemporaryError) Error() string {
57+
return te.err.Error()
58+
}

pkg/common/utils_test.go

+20
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,26 @@ func TestCodeForError(t *testing.T) {
10211021
inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
10221022
expCode: codes.InvalidArgument,
10231023
},
1024+
{
1025+
name: "TemporaryError that wraps googleapi error",
1026+
inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}},
1027+
expCode: codes.Unavailable,
1028+
},
1029+
{
1030+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
1031+
inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})},
1032+
expCode: codes.Aborted,
1033+
},
1034+
{
1035+
name: "TemporaryError that wraps status error",
1036+
inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")},
1037+
expCode: codes.Aborted,
1038+
},
1039+
{
1040+
name: "TemporaryError that wraps context canceled error",
1041+
inputErr: &TemporaryError{code: codes.Aborted, err: context.Canceled},
1042+
expCode: codes.Aborted,
1043+
},
10241044
}
10251045

10261046
for _, tc := range testCases {

pkg/gce-cloud-provider/compute/gce-compute.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
562562
if err != nil {
563563
// failed to GetDisk, however the Disk may already exist
564564
// the error code should be non-Final
565-
return status.Error(codes.Unavailable, err.Error())
565+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
566566
}
567567
err = cloud.ValidateExistingDisk(ctx, disk, params,
568568
int64(capacityRange.GetRequiredBytes()),
@@ -586,7 +586,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
586586
if IsGCEError(err, "alreadyExists") {
587587
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
588588
if err != nil {
589-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
589+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
590590
}
591591
err = cloud.ValidateExistingDisk(ctx, disk, params,
592592
int64(capacityRange.GetRequiredBytes()),
@@ -598,7 +598,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
598598
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
599599
return nil
600600
}
601-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
601+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
602602
}
603603
return nil
604604
}
@@ -702,7 +702,7 @@ func (cloud *CloudProvider) insertZonalDisk(
702702
if err != nil {
703703
// failed to GetDisk, however the Disk may already exist
704704
// the error code should be non-Final
705-
return status.Error(codes.Unavailable, err.Error())
705+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
706706
}
707707
err = cloud.ValidateExistingDisk(ctx, disk, params,
708708
int64(capacityRange.GetRequiredBytes()),
@@ -727,7 +727,7 @@ func (cloud *CloudProvider) insertZonalDisk(
727727
if IsGCEError(err, "alreadyExists") {
728728
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
729729
if err != nil {
730-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
730+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
731731
}
732732
err = cloud.ValidateExistingDisk(ctx, disk, params,
733733
int64(capacityRange.GetRequiredBytes()),
@@ -739,7 +739,7 @@ func (cloud *CloudProvider) insertZonalDisk(
739739
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
740740
return nil
741741
}
742-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
742+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
743743
}
744744
return nil
745745
}

pkg/gce-pd-csi-driver/controller.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
20012001
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20022002
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
20032003
if err != nil {
2004-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
2004+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err))
20052005
}
20062006
return disk, nil
20072007
}
@@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
20242024
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20252025
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
20262026
if err != nil {
2027-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
2027+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err))
20282028
}
20292029
return disk, nil
20302030
}

pkg/metrics/metrics.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package metrics
1818

1919
import (
20+
"errors"
2021
"fmt"
2122
"net/http"
2223
"os"
@@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
9798
diskType string,
9899
enableConfidentialStorage string,
99100
enableStoragePools string) {
100-
err := codes.OK.String()
101-
if operationErr != nil {
102-
err = common.CodeForError(operationErr).String()
103-
}
104-
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc()
101+
errCode := errorCodeLabelValue(operationErr)
102+
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc()
103+
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
105104
}
106105

107106
func (mm *MetricsManager) EmitGKEComponentVersion() error {
@@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) {
169168
}
170169
return diskType, enableConfidentialStorage, enableStoragePools
171170
}
171+
172+
// errorCodeLabelValue returns the label value for the given operation error.
173+
// This was separated into a helper function for unit testing purposes.
174+
func errorCodeLabelValue(operationErr error) string {
175+
err := codes.OK.String()
176+
if operationErr != nil {
177+
// If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError.
178+
var tempErr *common.TemporaryError
179+
if errors.As(operationErr, &tempErr) {
180+
operationErr = tempErr.Unwrap()
181+
}
182+
err = common.CodeForError(operationErr).String()
183+
}
184+
return err
185+
}

pkg/metrics/metrics_test.go

+82
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,21 @@ limitations under the License.
1818
package metrics
1919

2020
import (
21+
"context"
22+
"errors"
23+
"fmt"
24+
"net/http"
2125
"testing"
2226

27+
"github.com/google/go-cmp/cmp"
2328
computealpha "google.golang.org/api/compute/v0.alpha"
2429
computebeta "google.golang.org/api/compute/v0.beta"
2530
"google.golang.org/api/compute/v1"
31+
"google.golang.org/api/googleapi"
32+
"google.golang.org/grpc/codes"
33+
"google.golang.org/grpc/status"
34+
35+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2636
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
2737
)
2838

@@ -115,3 +125,75 @@ func TestGetMetricParameters(t *testing.T) {
115125
}
116126
}
117127
}
128+
129+
func TestErrorCodeLabelValue(t *testing.T) {
130+
testCases := []struct {
131+
name string
132+
operationErr error
133+
wantErrorCode string
134+
}{
135+
{
136+
name: "Not googleapi.Error",
137+
operationErr: errors.New("I am not a googleapi.Error"),
138+
wantErrorCode: "Internal",
139+
},
140+
{
141+
name: "User error",
142+
operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
143+
wantErrorCode: "InvalidArgument",
144+
},
145+
{
146+
name: "googleapi.Error but not a user error",
147+
operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
148+
wantErrorCode: "Internal",
149+
},
150+
{
151+
name: "context canceled error",
152+
operationErr: context.Canceled,
153+
wantErrorCode: "Canceled",
154+
},
155+
{
156+
name: "context deadline exceeded error",
157+
operationErr: context.DeadlineExceeded,
158+
wantErrorCode: "DeadlineExceeded",
159+
},
160+
{
161+
name: "status error with Aborted error code",
162+
operationErr: status.Error(codes.Aborted, "aborted error"),
163+
wantErrorCode: "Aborted",
164+
},
165+
{
166+
name: "user multiattach error",
167+
operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
168+
wantErrorCode: "InvalidArgument",
169+
},
170+
{
171+
name: "TemporaryError that wraps googleapi error",
172+
operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}),
173+
wantErrorCode: "InvalidArgument",
174+
},
175+
{
176+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
177+
operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})),
178+
wantErrorCode: "InvalidArgument",
179+
},
180+
{
181+
name: "TemporaryError that wraps status error",
182+
operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")),
183+
wantErrorCode: "InvalidArgument",
184+
},
185+
{
186+
name: "TemporaryError that wraps multiattach error",
187+
operationErr: common.NewTemporaryError(codes.Unavailable, fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'")),
188+
wantErrorCode: "InvalidArgument",
189+
},
190+
}
191+
192+
for _, tc := range testCases {
193+
t.Logf("Running test: %v", tc.name)
194+
errCode := errorCodeLabelValue(tc.operationErr)
195+
if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" {
196+
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
197+
}
198+
}
199+
}

0 commit comments

Comments
 (0)