Skip to content

Commit c0695cb

Browse files
authored
Merge pull request #1952 from hajiler/mount-error-metrics-branch
Introduce metrics for pdcsi node
2 parents 84af6e9 + 9e68dea commit c0695cb

File tree

6 files changed

+97
-5
lines changed

6 files changed

+97
-5
lines changed

cmd/gce-pd-csi-driver/main.go

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -151,13 +151,22 @@ func handle() {
151151
}
152152

153153
var metricsManager *metrics.MetricsManager = nil
154-
if *runControllerService && *httpEndpoint != "" {
154+
runServiceWithMetrics := *runControllerService || *runNodeService
155+
if runServiceWithMetrics && *httpEndpoint != "" {
155156
mm := metrics.NewMetricsManager()
156157
mm.InitializeHttpHandler(*httpEndpoint, *metricsPath)
157-
mm.RegisterPDCSIMetric()
158158

159-
if metrics.IsGKEComponentVersionAvailable() {
160-
mm.EmitGKEComponentVersion()
159+
switch {
160+
case *runControllerService:
161+
mm.RegisterPDCSIMetric()
162+
if metrics.IsGKEComponentVersionAvailable() {
163+
mm.EmitGKEComponentVersion()
164+
}
165+
case *runNodeService:
166+
if err := mm.EmmitProcessStartTime(); err != nil {
167+
klog.Errorf("Failed to emit process start time: %v", err.Error())
168+
}
169+
mm.RegisterMountMetric()
161170
}
162171
metricsManager = &mm
163172
}
@@ -266,6 +275,7 @@ func handle() {
266275
EnableDataCache: *enableDataCacheFlag,
267276
DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
268277
SysfsPath: "/sys",
278+
MetricsManager: metricsManager,
269279
}
270280
nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)
271281

pkg/gce-pd-csi-driver/gce-pd-driver.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,7 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi
158158
EnableDataCache: args.EnableDataCache,
159159
DataCacheEnabledNodePool: args.DataCacheEnabledNodePool,
160160
SysfsPath: args.SysfsPath,
161+
metricsManager: args.MetricsManager,
161162
}
162163
}
163164

pkg/gce-pd-csi-driver/node.go

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ import (
4040
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
4141
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils"
4242
metadataservice "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/metadata"
43+
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics"
4344
mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager"
4445
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/resizefs"
4546
)
@@ -77,6 +78,8 @@ type GCENodeServer struct {
7778
// Embed UnimplementedNodeServer to ensure the driver returns Unimplemented for any
7879
// new RPC methods that might be introduced in future versions of the spec.
7980
csi.UnimplementedNodeServer
81+
82+
metricsManager *metrics.MetricsManager
8083
}
8184

8285
type NodeServerArgs struct {
@@ -92,6 +95,8 @@ type NodeServerArgs struct {
9295

9396
// SysfsPath defaults to "/sys", except if it's a unit test.
9497
SysfsPath string
98+
99+
MetricsManager *metrics.MetricsManager
95100
}
96101

97102
var _ csi.NodeServer = &GCENodeServer{}
@@ -442,6 +447,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
442447
return &csi.NodeStageVolumeResponse{}, nil
443448
}
444449
}
450+
445451
return nil, status.Error(codes.Internal,
446452
fmt.Sprintf("Failed to format and mount device from (%q) to (%q) with fstype (%q) and options (%q): %v",
447453
devicePath, stagingTargetPath, fstype, options, err.Error()))

pkg/gce-pd-csi-driver/utils_linux.go

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,12 @@ func (ns *GCENodeServer) formatAndMount(source, target, fstype string, options [
7575
}
7676
}()
7777
}
78-
return m.FormatAndMount(source, target, fstype, options)
78+
79+
err := m.FormatAndMount(source, target, fstype, options)
80+
if ns.metricsManager != nil {
81+
ns.metricsManager.RecordMountErrorMetric(fstype, err)
82+
}
83+
return err
7984
}
8085

8186
func preparePublishPath(path string, m *mount.SafeFormatAndMount) error {

pkg/metrics/metrics.go

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
2525
"google.golang.org/grpc/codes"
2626
"k8s.io/component-base/metrics"
2727
"k8s.io/klog/v2"
28+
"k8s.io/mount-utils"
2829
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2930
)
3031

@@ -53,6 +54,15 @@ var (
5354
StabilityLevel: metrics.ALPHA,
5455
},
5556
[]string{"driver_name", "method_name", "grpc_status_code", "disk_type", "enable_confidential_storage", "enable_storage_pools"})
57+
58+
mountErrorMetric = metrics.NewCounterVec(&metrics.CounterOpts{
59+
Subsystem: "node",
60+
Name: "mount_errors",
61+
Help: "Node server file system mounting errors",
62+
StabilityLevel: metrics.ALPHA,
63+
},
64+
[]string{"driver_name", "file_system_format", "error_type"},
65+
)
5666
)
5767

5868
type MetricsManager struct {
@@ -78,6 +88,10 @@ func (mm *MetricsManager) RegisterPDCSIMetric() {
7888
mm.registry.MustRegister(pdcsiOperationErrorsMetric)
7989
}
8090

91+
func (mm *MetricsManager) RegisterMountMetric() {
92+
mm.registry.MustRegister(mountErrorMetric)
93+
}
94+
8195
func (mm *MetricsManager) recordComponentVersionMetric() error {
8296
v := getEnvVar(envGKEPDCSIVersion)
8397
if v == "" {
@@ -101,6 +115,16 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
101115
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
102116
}
103117

118+
func (mm *MetricsManager) RecordMountErrorMetric(fs_format string, err error) {
119+
errType := mountErrorType(err)
120+
mountErrorMetric.WithLabelValues(pdcsiDriverName, fs_format, errType).Inc()
121+
klog.Infof("Recorded mount error type: %q", errType)
122+
}
123+
124+
func (mm *MetricsManager) EmmitProcessStartTime() error {
125+
return metrics.RegisterProcessStartTime(mm.registry.Register)
126+
}
127+
104128
func (mm *MetricsManager) EmitGKEComponentVersion() error {
105129
mm.registerComponentVersionMetric()
106130
if err := mm.recordComponentVersionMetric(); err != nil {
@@ -169,3 +193,16 @@ func errorCodeLabelValue(operationErr error) string {
169193
}
170194
return err
171195
}
196+
197+
func mountErrorType(err error) string {
198+
if err == nil {
199+
return "OK"
200+
}
201+
202+
mntErr := &mount.MountError{}
203+
if !errors.As(err, mntErr) {
204+
return "UnknownError"
205+
}
206+
207+
return string(mntErr.Type)
208+
}

pkg/metrics/metrics_test.go

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ import (
2929
"google.golang.org/api/googleapi"
3030
"google.golang.org/grpc/codes"
3131
"google.golang.org/grpc/status"
32+
"k8s.io/mount-utils"
3233

3334
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
3435
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
@@ -186,3 +187,35 @@ func TestErrorCodeLabelValue(t *testing.T) {
186187
}
187188
}
188189
}
190+
191+
func TestMountOperationError(t *testing.T) {
192+
testCases := []struct {
193+
name string
194+
err error
195+
want string
196+
}{
197+
{
198+
name: "no error",
199+
want: "OK",
200+
},
201+
{
202+
name: "unknown error",
203+
err: fmt.Errorf("fake error"),
204+
want: "UnknownError",
205+
},
206+
{
207+
name: "mount error",
208+
err: mount.NewMountError(mount.FormatFailed, "file system format failed"),
209+
want: string(mount.FormatFailed),
210+
},
211+
}
212+
213+
for _, tc := range testCases {
214+
t.Run(tc.name, func(t *testing.T) {
215+
got := mountErrorType(tc.err)
216+
if diff := cmp.Diff(tc.want, got); diff != "" {
217+
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
218+
}
219+
})
220+
}
221+
}

0 commit comments

Comments
 (0)