Skip to content

Commit ad9eaa4

Browse files
committed
Record original error code to operation_errors metric for temporary errors
1 parent cb041f3 commit ad9eaa4

File tree

8 files changed

+264
-13
lines changed

8 files changed

+264
-13
lines changed

pkg/common/errors.go

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"google.golang.org/grpc/codes"
21+
)
22+
23+
// TemporaryError wraps an error that should be returned to CSI Provisioner
24+
// with the temporary Error code. It implements the error interface.
25+
type TemporaryError struct {
26+
err error
27+
code codes.Code
28+
}
29+
30+
// Unwrap extracts the original error.
31+
func (te *TemporaryError) Unwrap() error {
32+
return te.err
33+
}
34+
35+
func (te *TemporaryError) Code() codes.Code {
36+
return te.code
37+
}
38+
39+
func NewTemporaryError(code codes.Code, err error) *TemporaryError {
40+
return &TemporaryError{err: err, code: code}
41+
}
42+
43+
// Error returns a readable representation of the TemporaryError.
44+
func (te *TemporaryError) Error() string {
45+
return te.err.Error()
46+
}

pkg/common/utils.go

+6
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,12 @@ func existingErrorCode(err error) (codes.Code, error) {
400400

401401
func LoggedError(msg string, err error) error {
402402
klog.Errorf(msg+"%v", err.Error())
403+
404+
// If err is a temporary error, return the temporary error code back to the csi provisioner.
405+
var tempErr *TemporaryError
406+
if errors.As(err, &tempErr) {
407+
return status.Errorf(tempErr.Code(), msg+"%v", tempErr.Error())
408+
}
403409
return status.Errorf(CodeForError(err), msg+"%v", err.Error())
404410
}
405411

pkg/common/utils_test.go

+69
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,75 @@ func TestCodeForError(t *testing.T) {
10311031
}
10321032
}
10331033

1034+
func TestLoggedError(t *testing.T) {
1035+
testCases := []struct {
1036+
name string
1037+
inputErr error
1038+
msg string
1039+
expErrorStr string
1040+
}{
1041+
{
1042+
name: "Not googleapi.Error",
1043+
msg: "error found: ",
1044+
inputErr: errors.New("I am not a googleapi.Error"),
1045+
expErrorStr: "rpc error: code = Internal desc = error found: I am not a googleapi.Error",
1046+
},
1047+
{
1048+
name: "User error",
1049+
inputErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
1050+
expErrorStr: "rpc error: code = InvalidArgument desc = googleapi: Error 400: User error with bad request",
1051+
},
1052+
{
1053+
name: "googleapi.Error but not a user error",
1054+
inputErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
1055+
expErrorStr: "rpc error: code = Internal desc = googleapi: Error 500: Internal error",
1056+
},
1057+
{
1058+
name: "context canceled error",
1059+
inputErr: context.Canceled,
1060+
expErrorStr: "rpc error: code = Canceled desc = context canceled",
1061+
},
1062+
{
1063+
name: "context deadline exceeded error",
1064+
inputErr: context.DeadlineExceeded,
1065+
expErrorStr: "rpc error: code = DeadlineExceeded desc = context deadline exceeded",
1066+
},
1067+
{
1068+
name: "status error with Aborted error code",
1069+
inputErr: status.Error(codes.Aborted, "aborted error"),
1070+
expErrorStr: "rpc error: code = Aborted desc = rpc error: code = Aborted desc = aborted error",
1071+
},
1072+
{
1073+
name: "user multiattach error",
1074+
inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
1075+
expErrorStr: "rpc error: code = InvalidArgument desc = The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'",
1076+
},
1077+
{
1078+
name: "TemporaryError that wraps googleapi error",
1079+
inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}},
1080+
expErrorStr: "rpc error: code = Unavailable desc = googleapi: Error 400: User error with bad request",
1081+
},
1082+
{
1083+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
1084+
inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})},
1085+
expErrorStr: "rpc error: code = Aborted desc = got error: googleapi: Error 400: User error with bad request",
1086+
},
1087+
{
1088+
name: "TemporaryError that wraps status error",
1089+
inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")},
1090+
expErrorStr: "rpc error: code = Aborted desc = rpc error: code = Aborted desc = aborted error",
1091+
},
1092+
}
1093+
1094+
for _, tc := range testCases {
1095+
err := LoggedError(tc.msg, tc.inputErr)
1096+
// Validate the error returned is a status error.
1097+
if diff := cmp.Diff(tc.expErrorStr, err.Error()); diff != "" {
1098+
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
1099+
}
1100+
}
1101+
}
1102+
10341103
func TestIsContextError(t *testing.T) {
10351104
cases := []struct {
10361105
name string

pkg/gce-cloud-provider/compute/gce-compute.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
562562
if err != nil {
563563
// failed to GetDisk, however the Disk may already exist
564564
// the error code should be non-Final
565-
return status.Error(codes.Unavailable, err.Error())
565+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
566566
}
567567
err = cloud.ValidateExistingDisk(ctx, disk, params,
568568
int64(capacityRange.GetRequiredBytes()),
@@ -586,7 +586,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
586586
if IsGCEError(err, "alreadyExists") {
587587
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
588588
if err != nil {
589-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
589+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
590590
}
591591
err = cloud.ValidateExistingDisk(ctx, disk, params,
592592
int64(capacityRange.GetRequiredBytes()),
@@ -598,7 +598,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
598598
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
599599
return nil
600600
}
601-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
601+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
602602
}
603603
return nil
604604
}
@@ -702,7 +702,7 @@ func (cloud *CloudProvider) insertZonalDisk(
702702
if err != nil {
703703
// failed to GetDisk, however the Disk may already exist
704704
// the error code should be non-Final
705-
return status.Error(codes.Unavailable, err.Error())
705+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
706706
}
707707
err = cloud.ValidateExistingDisk(ctx, disk, params,
708708
int64(capacityRange.GetRequiredBytes()),
@@ -727,7 +727,7 @@ func (cloud *CloudProvider) insertZonalDisk(
727727
if IsGCEError(err, "alreadyExists") {
728728
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
729729
if err != nil {
730-
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
730+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
731731
}
732732
err = cloud.ValidateExistingDisk(ctx, disk, params,
733733
int64(capacityRange.GetRequiredBytes()),
@@ -739,7 +739,7 @@ func (cloud *CloudProvider) insertZonalDisk(
739739
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
740740
return nil
741741
}
742-
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
742+
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
743743
}
744744
return nil
745745
}

pkg/gce-pd-csi-driver/controller.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
20012001
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20022002
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
20032003
if err != nil {
2004-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
2004+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err))
20052005
}
20062006
return disk, nil
20072007
}
@@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
20242024
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
20252025
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
20262026
if err != nil {
2027-
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
2027+
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err))
20282028
}
20292029
return disk, nil
20302030
}

pkg/metrics/metrics.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package metrics
1818

1919
import (
20+
"errors"
2021
"fmt"
2122
"net/http"
2223
"os"
@@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
9798
diskType string,
9899
enableConfidentialStorage string,
99100
enableStoragePools string) {
100-
err := codes.OK.String()
101-
if operationErr != nil {
102-
err = common.CodeForError(operationErr).String()
103-
}
104-
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc()
101+
errCode := errorCodeLabelValue(operationErr)
102+
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc()
103+
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
105104
}
106105

107106
func (mm *MetricsManager) EmitGKEComponentVersion() error {
@@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) {
169168
}
170169
return diskType, enableConfidentialStorage, enableStoragePools
171170
}
171+
172+
// errorCodeLabelValue returns the label value for the given operation error.
173+
// This was separated into a helper function for unit testing purposes.
174+
func errorCodeLabelValue(operationErr error) string {
175+
err := codes.OK.String()
176+
if operationErr != nil {
177+
// If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError.
178+
var tempErr *common.TemporaryError
179+
if errors.As(operationErr, &tempErr) {
180+
operationErr = tempErr.Unwrap()
181+
}
182+
err = common.CodeForError(operationErr).String()
183+
}
184+
return err
185+
}

pkg/metrics/metrics_test.go

+77
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,21 @@ limitations under the License.
1818
package metrics
1919

2020
import (
21+
"context"
22+
"errors"
23+
"fmt"
24+
"net/http"
2125
"testing"
2226

27+
"github.com/google/go-cmp/cmp"
2328
computealpha "google.golang.org/api/compute/v0.alpha"
2429
computebeta "google.golang.org/api/compute/v0.beta"
2530
"google.golang.org/api/compute/v1"
31+
"google.golang.org/api/googleapi"
32+
"google.golang.org/grpc/codes"
33+
"google.golang.org/grpc/status"
34+
35+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2636
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
2737
)
2838

@@ -115,3 +125,70 @@ func TestGetMetricParameters(t *testing.T) {
115125
}
116126
}
117127
}
128+
129+
func TestErrorCodeLabelValue(t *testing.T) {
130+
testCases := []struct {
131+
name string
132+
operationErr error
133+
wantErrorCode string
134+
}{
135+
{
136+
name: "Not googleapi.Error",
137+
operationErr: errors.New("I am not a googleapi.Error"),
138+
wantErrorCode: "Internal",
139+
},
140+
{
141+
name: "User error",
142+
operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
143+
wantErrorCode: "InvalidArgument",
144+
},
145+
{
146+
name: "googleapi.Error but not a user error",
147+
operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
148+
wantErrorCode: "Internal",
149+
},
150+
{
151+
name: "context canceled error",
152+
operationErr: context.Canceled,
153+
wantErrorCode: "Canceled",
154+
},
155+
{
156+
name: "context deadline exceeded error",
157+
operationErr: context.DeadlineExceeded,
158+
wantErrorCode: "DeadlineExceeded",
159+
},
160+
{
161+
name: "status error with Aborted error code",
162+
operationErr: status.Error(codes.Aborted, "aborted error"),
163+
wantErrorCode: "Aborted",
164+
},
165+
{
166+
name: "user multiattach error",
167+
operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
168+
wantErrorCode: "InvalidArgument",
169+
},
170+
{
171+
name: "TemporaryError that wraps googleapi error",
172+
operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}),
173+
wantErrorCode: "InvalidArgument",
174+
},
175+
{
176+
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
177+
operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})),
178+
wantErrorCode: "InvalidArgument",
179+
},
180+
{
181+
name: "TemporaryError that wraps status error",
182+
operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")),
183+
wantErrorCode: "InvalidArgument",
184+
},
185+
}
186+
187+
for _, tc := range testCases {
188+
t.Logf("Running test: %v", tc.name)
189+
errCode := errorCodeLabelValue(tc.operationErr)
190+
if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" {
191+
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
192+
}
193+
}
194+
}

test/e2e/tests/single_zone_e2e_test.go

+39
Original file line numberDiff line numberDiff line change
@@ -1449,6 +1449,45 @@ var _ = Describe("GCE PD CSI Driver", func() {
14491449
Entry("with missing multi-zone label", multiZoneTestConfig{diskType: standardDiskType, readOnly: true, hasMultiZoneLabel: false, wantErrSubstring: "points to disk that is missing label \"goog-gke-multi-zone\""}),
14501450
Entry("with unsupported disk-type pd-extreme", multiZoneTestConfig{diskType: extremeDiskType, readOnly: true, hasMultiZoneLabel: true, wantErrSubstring: "points to disk with unsupported disk type"}),
14511451
)
1452+
1453+
It("Should return temporary error code when error is returned from polling insertDisk operation", func() {
1454+
Expect(testContexts).ToNot(BeEmpty())
1455+
testContext := getRandomTestContext()
1456+
1457+
controllerInstance := testContext.Instance
1458+
controllerClient := testContext.Client
1459+
1460+
p, z, _ := controllerInstance.GetIdentity()
1461+
1462+
// Create Source Disk
1463+
_, srcVolID := createAndValidateUniqueZonalDisk(controllerClient, p, z, standardDiskType)
1464+
1465+
topReq := &csi.TopologyRequirement{
1466+
Requisite: []*csi.Topology{
1467+
{
1468+
Segments: map[string]string{common.TopologyKeyZone: z},
1469+
},
1470+
},
1471+
}
1472+
volContentSrc := &csi.VolumeContentSource{
1473+
Type: &csi.VolumeContentSource_Volume{
1474+
Volume: &csi.VolumeContentSource_VolumeSource{
1475+
VolumeId: srcVolID,
1476+
},
1477+
},
1478+
}
1479+
// Create 2 disk clones within 30 seconds, to violate the GCE restriction:
1480+
// 'You can create at most one clone of a given source disk or its clones every 30 seconds.'
1481+
// https://cloud.google.com/compute/docs/disks/create-disk-from-source#restrictions
1482+
volName1 := testNamePrefix + string(uuid.NewUUID())
1483+
volName2 := testNamePrefix + string(uuid.NewUUID())
1484+
_, err1 := controllerClient.CreateVolume(volName1, nil, defaultSizeGb, topReq, volContentSrc)
1485+
_, err2 := controllerClient.CreateVolume(volName2, nil, defaultSizeGb, topReq, volContentSrc)
1486+
1487+
Expect(err1).To(BeNil(), "CreateVolume failed with error: %v", err1)
1488+
Expect(err2).ToNot(BeNil(), "Unexpected success")
1489+
Expect(err2.Error()).To(ContainSubstring("Unavailable"))
1490+
})
14521491
})
14531492

14541493
func equalWithinEpsilon(a, b, epsiolon int64) bool {

0 commit comments

Comments
 (0)