Skip to content

Commit 0af41c6

Browse files
committed
removed hyperdisk driver override, updated error messages and other pr suggestions
1 parent 06c1be4 commit 0af41c6

File tree

3 files changed

+59
-71
lines changed

3 files changed

+59
-71
lines changed

cmd/gce-pd-csi-driver/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func handle() {
101101
if *runControllerService && *httpEndpoint != "" {
102102
mm := metrics.NewMetricsManager()
103103
mm.InitializeHttpHandler(*httpEndpoint, *metricsPath)
104-
mm.RegisterHyperdiskMetric()
104+
mm.RegisterPDCSIMetric()
105105

106106
if metrics.IsGKEComponentVersionAvailable() {
107107
mm.EmitGKEComponentVersion()

pkg/gce-pd-csi-driver/controller.go

+39-52
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ const (
123123

124124
replicationTypeNone = "none"
125125
replicationTypeRegionalPD = "regional-pd"
126-
126+
diskNotFound = ""
127127
// The maximum number of entries that we can include in the
128128
// ListVolumesResposne
129129
// In reality, the limit here is 4MB (based on gRPC client response limits),
@@ -278,7 +278,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
278278
existingDisk, err := gceCS.CloudProvider.GetDisk(ctx, gceCS.CloudProvider.GetDefaultProject(), volKey, gceAPIVersion)
279279
if err != nil {
280280
if !gce.IsGCEError(err, "notFound") {
281-
return nil, common.LoggedError("CreateVolume unknown get disk error when validating: ", err)
281+
return nil, common.LoggedError("CreateVolume, failed to getDisk when validating: ", err)
282282
}
283283
}
284284
if err == nil {
@@ -334,7 +334,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
334334
if gce.IsGCEError(err, "notFound") {
335335
return nil, status.Errorf(codes.NotFound, "CreateVolume source volume %s does not exist", volumeContentSourceVolumeID)
336336
} else {
337-
return nil, common.LoggedError("CreateVolume unknown get disk error when validating: ", err)
337+
return nil, common.LoggedError("CreateVolume, getDisk error when validating: ", err)
338338
}
339339
}
340340

@@ -394,9 +394,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
394394
disk, err = createSingleZoneDisk(ctx, gceCS.CloudProvider, name, zones, params, capacityRange, capBytes, snapshotID, volumeContentSourceVolumeID, multiWriter)
395395
if err != nil {
396396
// Emit metric for expected disk type from storage class
397-
if params.DiskType != "" {
398-
gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
399-
}
397+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
400398
return nil, common.LoggedError("CreateVolume failed to create single zonal disk "+name+": ", err)
401399
}
402400
case replicationTypeRegionalPD:
@@ -406,9 +404,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
406404
disk, err = createRegionalDisk(ctx, gceCS.CloudProvider, name, zones, params, capacityRange, capBytes, snapshotID, volumeContentSourceVolumeID, multiWriter)
407405
if err != nil {
408406
// Emit metric for expected disk type from storage class
409-
if params.DiskType != "" {
410-
gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
411-
}
407+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
412408
return nil, common.LoggedError("CreateVolume failed to create regional disk "+name+": ", err)
413409
}
414410
default:
@@ -418,9 +414,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
418414
ready, err := isDiskReady(disk)
419415
if err != nil {
420416
// Emit metric for expected disk type from storage class as the disk is not ready and might not have PD type populated
421-
if params.DiskType != "" {
422-
gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
423-
}
417+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateVolume", err, params.DiskType)
424418
return nil, status.Errorf(codes.Internal, "CreateVolume disk %v had error checking ready status: %v", volKey, err.Error())
425419
}
426420
if !ready {
@@ -463,7 +457,7 @@ func (gceCS *GCEControllerServer) DeleteVolume(ctx context.Context, req *csi.Del
463457

464458
err = gceCS.CloudProvider.DeleteDisk(ctx, project, volKey)
465459
if err != nil {
466-
return nil, common.LoggedError("unknown Delete disk error: ", err)
460+
return nil, common.LoggedError("Failed to delete disk: ", err)
467461
}
468462

469463
klog.V(4).Infof("DeleteVolume succeeded for disk %v", volKey)
@@ -563,10 +557,11 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
563557
defer gceCS.volumeLocks.Release(lockingVolumeID)
564558
diskToPublish, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1)
565559
if err != nil {
560+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, diskNotFound)
566561
if gce.IsGCENotFoundError(err) {
567562
return nil, status.Errorf(codes.NotFound, "Could not find disk %v: %v", volKey.String(), err.Error())
568563
}
569-
return nil, status.Errorf(codes.Internal, "Unknown get disk error: %v", err.Error())
564+
return nil, status.Errorf(codes.Internal, "Failed to getDisk: %v", err.Error())
570565
}
571566
instanceZone, instanceName, err := common.NodeIDToZoneAndName(nodeID)
572567
if err != nil {
@@ -577,7 +572,7 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
577572
if gce.IsGCENotFoundError(err) {
578573
return nil, status.Errorf(codes.NotFound, "Could not find instance %v: %v", nodeID, err.Error())
579574
}
580-
return nil, status.Errorf(codes.Internal, "Unknown get instance error: %v", err.Error())
575+
return nil, status.Errorf(codes.Internal, "Failed to get instance: %v", err.Error())
581576
}
582577

583578
readWrite := "READ_WRITE"
@@ -613,16 +608,17 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
613608
return nil, status.Errorf(codes.InvalidArgument, "'%s' is not a compatible disk type with the machine type %s, please review the GCP online documentation for available persistent disk options", udErr.DiskType, machineType)
614609
}
615610
// Emit metric for error
616-
gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, diskToPublish.GetPDType())
617-
return nil, status.Errorf(codes.Internal, "unknown Attach error: %v", err.Error())
611+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, metrics.GetDiskType(diskToPublish))
612+
return nil, status.Errorf(codes.Internal, "Failed to Attach: %v", err.Error())
618613
}
619614

620615
err = gceCS.CloudProvider.WaitForAttach(ctx, project, volKey, instanceZone, instanceName)
621616
if err != nil {
622617
// Emit metric for error
623-
gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, diskToPublish.GetPDType())
624-
return nil, status.Errorf(codes.Internal, "unknown WaitForAttach error: %v", err.Error())
618+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerPublishVolume", err, metrics.GetDiskType(diskToPublish))
619+
return nil, status.Errorf(codes.Internal, "Errored during WaitForAttach: %v", err.Error())
625620
}
621+
626622
klog.V(4).Infof("ControllerPublishVolume succeeded for disk %v to instance %v", volKey, nodeID)
627623
return pubVolResp, nil
628624
}
@@ -723,15 +719,13 @@ func (gceCS *GCEControllerServer) executeControllerUnpublishVolume(ctx context.C
723719
}
724720
diskToUnpublish, _ := gceCS.CloudProvider.GetDisk(ctx, gceCS.CloudProvider.GetDefaultProject(), volKey, gce.GCEAPIVersionV1)
725721
if err != nil {
726-
common.LoggedError("Unknown get disk error: ", err)
722+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerUnpublishVolume", err, diskNotFound)
723+
common.LoggedError("Failed to getDisk: ", err)
727724
}
728725
err = gceCS.CloudProvider.DetachDisk(ctx, project, deviceName, instanceZone, instanceName)
729726
if err != nil {
730-
//Do not emit metric if disk is unknown
731-
if diskToUnpublish != nil {
732-
gceCS.Metrics.RecordOperationErrorMetrics("ControllerUnpublishVolume", err, diskToUnpublish.GetPDType())
733-
}
734-
return nil, common.LoggedError("unknown detach error: ", err)
727+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerUnpublishVolume", err, metrics.GetDiskType(diskToUnpublish))
728+
return nil, common.LoggedError("Failed to detach: ", err)
735729
}
736730

737731
klog.V(4).Infof("ControllerUnpublishVolume succeeded for disk %v from node %v", volKey, nodeID)
@@ -769,7 +763,7 @@ func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context
769763
if gce.IsGCENotFoundError(err) {
770764
return nil, status.Errorf(codes.NotFound, "Could not find disk %v: %v", volKey.Name, err.Error())
771765
}
772-
return nil, common.LoggedError("Unknown get disk error: ", err)
766+
return nil, common.LoggedError("Failed to getDisk: ", err)
773767
}
774768

775769
// Check Volume Context is Empty
@@ -832,7 +826,7 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
832826
if gce.IsGCEInvalidError(err) {
833827
return nil, status.Errorf(codes.Aborted, "ListVolumes error with invalid request: %v", err.Error())
834828
}
835-
return nil, common.LoggedError("Unknown list disk error: ", err)
829+
return nil, common.LoggedError("Failed to list disk: ", err)
836830
}
837831
gceCS.disks = diskList
838832
gceCS.seen = map[string]int{}
@@ -915,7 +909,7 @@ func (gceCS *GCEControllerServer) CreateSnapshot(ctx context.Context, req *csi.C
915909
if gce.IsGCENotFoundError(err) {
916910
return nil, status.Errorf(codes.NotFound, "CreateSnapshot could not find disk %v: %v", volKey.String(), err.Error())
917911
}
918-
return nil, common.LoggedError("CreateSnapshot unknown get disk error: ", err)
912+
return nil, common.LoggedError("CreateSnapshot, failed to getDisk: ", err)
919913
}
920914

921915
snapshotParams, err := common.ExtractAndDefaultSnapshotParameters(req.GetParameters(), gceCS.Driver.name)
@@ -950,35 +944,30 @@ func (gceCS *GCEControllerServer) createPDSnapshot(ctx context.Context, project
950944
}
951945
sourceDisk, err := gceCS.CloudProvider.GetDisk(ctx, gceCS.CloudProvider.GetDefaultProject(), volKey, gce.GCEAPIVersionV1)
952946
if err != nil {
953-
common.LoggedError("Unknown get disk error: ", err)
947+
common.LoggedError("Failed to getDisk: ", err)
954948
}
955949
// Check if PD snapshot already exists
956950
var snapshot *compute.Snapshot
957951
snapshot, err = gceCS.CloudProvider.GetSnapshot(ctx, project, snapshotName)
958952
if err != nil {
959953
if !gce.IsGCEError(err, "notFound") {
960-
return nil, status.Errorf(codes.Internal, "Unknown get snapshot error: %v", err.Error())
954+
return nil, status.Errorf(codes.Internal, "Failed to get snapshot: %v", err.Error())
961955
}
962956
// If we could not find the snapshot, we create a new one
963957
snapshot, err = gceCS.CloudProvider.CreateSnapshot(ctx, project, volKey, snapshotName, snapshotParams)
964958
if err != nil {
965959
if gce.IsGCEError(err, "notFound") {
960+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, diskNotFound)
966961
return nil, status.Errorf(codes.NotFound, "Could not find volume with ID %v: %v", volKey.String(), err.Error())
967962
}
968-
//Do not emit metric if disk is unknown
969-
if sourceDisk != nil {
970-
gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, sourceDisk.GetPDType())
971-
}
972-
return nil, common.LoggedError("Unknown create snapshot error: ", err)
963+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, metrics.GetDiskType(sourceDisk))
964+
return nil, common.LoggedError("Failed to create snapshot: ", err)
973965
}
974966
}
975967

976968
err = gceCS.validateExistingSnapshot(snapshot, volKey)
977969
if err != nil {
978-
//Do not emit metric if disk is unknown
979-
if sourceDisk != nil {
980-
gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, sourceDisk.GetPDType())
981-
}
970+
defer gceCS.Metrics.RecordOperationErrorMetrics("CreateSnapshot", err, metrics.GetDiskType(sourceDisk))
982971
return nil, status.Errorf(codes.AlreadyExists, "Error in creating snapshot: %v", err.Error())
983972
}
984973

@@ -1012,15 +1001,15 @@ func (gceCS *GCEControllerServer) createImage(ctx context.Context, project strin
10121001
image, err = gceCS.CloudProvider.GetImage(ctx, project, imageName)
10131002
if err != nil {
10141003
if !gce.IsGCEError(err, "notFound") {
1015-
return nil, common.LoggedError("Unknown get image error: ", err)
1004+
return nil, common.LoggedError("Failed to get image: ", err)
10161005
}
10171006
// create a new image
10181007
image, err = gceCS.CloudProvider.CreateImage(ctx, project, volKey, imageName, snapshotParams)
10191008
if err != nil {
10201009
if gce.IsGCEError(err, "notFound") {
10211010
return nil, status.Errorf(codes.NotFound, "Could not find volume with ID %v: %v", volKey.String(), err.Error())
10221011
}
1023-
return nil, common.LoggedError("Unknown create image error: ", err)
1012+
return nil, common.LoggedError("Failed to create image: ", err)
10241013
}
10251014
}
10261015

@@ -1150,12 +1139,12 @@ func (gceCS *GCEControllerServer) DeleteSnapshot(ctx context.Context, req *csi.D
11501139
case common.DiskSnapshotType:
11511140
err = gceCS.CloudProvider.DeleteSnapshot(ctx, project, key)
11521141
if err != nil {
1153-
return nil, common.LoggedError("unknown Delete snapshot error: ", err)
1142+
return nil, common.LoggedError("Failed to DeleteSnapshot: ", err)
11541143
}
11551144
case common.DiskImageType:
11561145
err = gceCS.CloudProvider.DeleteImage(ctx, project, key)
11571146
if err != nil {
1158-
return nil, common.LoggedError("unknown Delete image error: ", err)
1147+
return nil, common.LoggedError("Failed to DeleteImage error: ", err)
11591148
}
11601149
default:
11611150
return nil, status.Errorf(codes.InvalidArgument, "unknown snapshot type %s", snapshotType)
@@ -1187,7 +1176,7 @@ func (gceCS *GCEControllerServer) ListSnapshots(ctx context.Context, req *csi.Li
11871176
if gce.IsGCEInvalidError(err) {
11881177
return nil, status.Errorf(codes.Aborted, "ListSnapshots error with invalid request: %v", err.Error())
11891178
}
1190-
return nil, common.LoggedError("Unknown list snapshots error: ", err)
1179+
return nil, common.LoggedError("Failed to list snapshots: ", err)
11911180
}
11921181
gceCS.snapshots = snapshotList
11931182
gceCS.snapshotTokens = map[string]int{}
@@ -1233,21 +1222,19 @@ func (gceCS *GCEControllerServer) ControllerExpandVolume(ctx context.Context, re
12331222

12341223
project, volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, project, volKey)
12351224
if err != nil {
1225+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerExpandVolume", err, diskNotFound)
12361226
if gce.IsGCENotFoundError(err) {
12371227
return nil, status.Errorf(codes.NotFound, "ControllerExpandVolume could not find volume with ID %v: %v", volumeID, err.Error())
12381228
}
12391229
return nil, common.LoggedError("ControllerExpandVolume error repairing underspecified volume key: ", err)
12401230
}
12411231
sourceDisk, err := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1)
12421232
if err != nil {
1243-
common.LoggedError("Unknown get disk error: ", err)
1233+
common.LoggedError("Failed to getDisk: ", err)
12441234
}
12451235
resizedGb, err := gceCS.CloudProvider.ResizeDisk(ctx, project, volKey, reqBytes)
12461236
if err != nil {
1247-
//Do not emit metric if disk is unknown
1248-
if sourceDisk != nil {
1249-
gceCS.Metrics.RecordOperationErrorMetrics("ControllerExpandVolume", err, sourceDisk.GetPDType())
1250-
}
1237+
defer gceCS.Metrics.RecordOperationErrorMetrics("ControllerExpandVolume", err, metrics.GetDiskType(sourceDisk))
12511238
return nil, common.LoggedError("ControllerExpandVolume failed to resize disk: ", err)
12521239
}
12531240

@@ -1271,15 +1258,15 @@ func (gceCS *GCEControllerServer) getSnapshots(ctx context.Context, req *csi.Lis
12711258
if gce.IsGCEError(err, "invalid") {
12721259
return nil, status.Errorf(codes.Aborted, "Invalid error: %v", err.Error())
12731260
}
1274-
return nil, common.LoggedError("Unknown list snapshot error: ", err)
1261+
return nil, common.LoggedError("Failed to list snapshot: ", err)
12751262
}
12761263

12771264
images, _, err = gceCS.CloudProvider.ListImages(ctx, filter)
12781265
if err != nil {
12791266
if gce.IsGCEError(err, "invalid") {
12801267
return nil, status.Errorf(codes.Aborted, "Invalid error: %v", err.Error())
12811268
}
1282-
return nil, common.LoggedError("Unknown list image error: ", err)
1269+
return nil, common.LoggedError("Failed to list image: ", err)
12831270
}
12841271

12851272
entries := []*csi.ListSnapshotsResponse_Entry{}
@@ -1320,7 +1307,7 @@ func (gceCS *GCEControllerServer) getSnapshotByID(ctx context.Context, snapshotI
13201307
// return empty list if no snapshot is found
13211308
return &csi.ListSnapshotsResponse{}, nil
13221309
}
1323-
return nil, common.LoggedError("Unknown list snapshot error: ", err)
1310+
return nil, common.LoggedError("Failed to list snapshot: ", err)
13241311
}
13251312
e, err := generateDiskSnapshotEntry(snapshot)
13261313
if err != nil {

pkg/metrics/metrics.go

+19-18
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,18 @@ import (
2020
"fmt"
2121
"net/http"
2222
"os"
23-
"strings"
2423

2524
"k8s.io/component-base/metrics"
2625
"k8s.io/klog/v2"
2726
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
27+
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
2828
)
2929

3030
const (
3131
// envGKEPDCSIVersion is an environment variable set in the PDCSI controller manifest
3232
// with the current version of the GKE component.
33-
envGKEPDCSIVersion = "GKE_PDCSI_VERSION"
34-
hyperdiskDriverName = "hyperdisk.csi.storage.gke.io"
35-
pdcsiDriverName = "pd.csi.storage.gke.io"
33+
envGKEPDCSIVersion = "GKE_PDCSI_VERSION"
34+
pdcsiDriverName = "pd.csi.storage.gke.io"
3635
)
3736

3837
var (
@@ -42,15 +41,14 @@ var (
4241
Help: "Metric to expose the version of the PDCSI GKE component.",
4342
}, []string{"component_version"})
4443

45-
pdcsiOperationErrorsMetric = metrics.NewGaugeVec(
46-
&metrics.GaugeOpts{
44+
pdcsiOperationErrorsMetric = metrics.NewCounterVec(
45+
&metrics.CounterOpts{
4746
Subsystem: "csidriver",
48-
Name: "pdcsi_operation_errors",
47+
Name: "operation_errors",
4948
Help: "CSI server side error metrics",
5049
StabilityLevel: metrics.ALPHA,
5150
},
52-
[]string{"driver_name", "method_name", "grpc_status_code", "disk_type"},
53-
)
51+
[]string{"driver_name", "method_name", "grpc_status_code", "disk_type"})
5452
)
5553

5654
type MetricsManager struct {
@@ -72,7 +70,7 @@ func (mm *MetricsManager) registerComponentVersionMetric() {
7270
mm.registry.MustRegister(gkeComponentVersion)
7371
}
7472

75-
func (mm *MetricsManager) RegisterHyperdiskMetric() {
73+
func (mm *MetricsManager) RegisterPDCSIMetric() {
7674
mm.registry.MustRegister(pdcsiOperationErrorsMetric)
7775
}
7876

@@ -92,14 +90,7 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
9290
operationName string,
9391
operationErr error,
9492
diskType string) {
95-
var driverName string
96-
if strings.Contains(diskType, "hyperdisk") {
97-
driverName = hyperdiskDriverName
98-
}
99-
if strings.Contains(diskType, "pd") {
100-
driverName = pdcsiDriverName
101-
}
102-
pdcsiOperationErrorsMetric.WithLabelValues(driverName, "/csi.v1.Controller/"+operationName, common.CodeForError(operationErr).String(), diskType).Set(1.0)
93+
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, common.CodeForError(operationErr).String(), diskType).Inc()
10394
}
10495

10596
func (mm *MetricsManager) EmitGKEComponentVersion() error {
@@ -155,3 +146,13 @@ func IsGKEComponentVersionAvailable() bool {
155146

156147
return true
157148
}
149+
150+
func GetDiskType(disk *gce.CloudDisk) string {
151+
var diskType string
152+
if disk != nil {
153+
diskType = disk.GetPDType()
154+
} else {
155+
diskType = ""
156+
}
157+
return diskType
158+
}

0 commit comments

Comments
 (0)