Skip to content

Commit d686ea9

Browse files
jentingroboquat
authored and committed
Add counter metrics to record error count
Add two counter metrics to record the `mount device failed` and `cannot mount volume` errors, so that we can track how frequently these errors occur. Signed-off-by: JenTing Hsiao <[email protected]>
1 parent 0a130f1 commit d686ea9

File tree

4 files changed

+28
-26
lines changed

4 files changed

+28
-26
lines changed

components/ws-manager/pkg/manager/metrics.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ type metrics struct {
5252
totalRestoreCounterVec *prometheus.CounterVec
5353
totalRestoreFailureCounterVec *prometheus.CounterVec
5454
totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec
55+
totalMountDeviceFailedVec *prometheus.CounterVec
56+
totalCannotMountVolumeVec *prometheus.CounterVec
5557

5658
// Gauge
5759
totalOpenPortGauge prometheus.GaugeFunc
@@ -142,6 +144,18 @@ func newMetrics(m *Manager) *metrics {
142144
Name: "workspace_unintentional_stop_total",
143145
Help: "total number of workspaces when container stopped without being deleted prior",
144146
}, []string{"type", "class"}),
147+
totalMountDeviceFailedVec: prometheus.NewCounterVec(prometheus.CounterOpts{
148+
Namespace: metricsNamespace,
149+
Subsystem: metricsWorkspaceSubsystem,
150+
Name: "workspace_mount_device_failed",
151+
Help: "total number of workspace mount device failed",
152+
}, []string{"type", "class"}),
153+
totalCannotMountVolumeVec: prometheus.NewCounterVec(prometheus.CounterOpts{
154+
Namespace: metricsNamespace,
155+
Subsystem: metricsWorkspaceSubsystem,
156+
Name: "workspace_cannot_mount_volume",
157+
Help: "total number of workspace cannot mount volume",
158+
}, []string{"type", "class"}),
145159
totalOpenPortGauge: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
146160
Namespace: metricsNamespace,
147161
Subsystem: metricsWorkspaceSubsystem,
@@ -205,6 +219,8 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
205219
m.totalRestoreCounterVec,
206220
m.totalRestoreFailureCounterVec,
207221
m.totalUnintentionalWorkspaceStopCounterVec,
222+
m.totalMountDeviceFailedVec,
223+
m.totalCannotMountVolumeVec,
208224
m.totalOpenPortGauge,
209225
}
210226
for _, c := range collectors {

components/ws-manager/pkg/manager/status.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,8 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
528528
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
529529
func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
530530
pod := wso.Pod
531+
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
532+
wsClass := pod.Labels[workspaceClassLabel]
531533

532534
// if the workspace was explicitely marked as failed that also constitutes a failure reason
533535
reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation]
@@ -590,8 +592,6 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp
590592
} else if terminationState.Reason == "Completed" {
591593
// container terminated successfully - this is not a failure
592594
if !isPodBeingDeleted(pod) {
593-
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
594-
wsClass := pod.Labels[workspaceClassLabel]
595595
if metrics != nil && !wso.IsWorkspaceHeadless() {
596596
metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
597597
}
@@ -618,9 +618,17 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp
618618
if strings.Contains(evt.Message, "MountVolume.MountDevice failed for volume") {
619619
// ref: https://github.com/gitpod-io/gitpod/issues/13353
620620
// ref: https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/issues/608
621+
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
622+
if metrics != nil {
623+
metrics.totalMountDeviceFailedVec.WithLabelValues(wsType, wsClass).Inc()
624+
}
621625
return "", nil
622626
} else if strings.Contains(evt.Message, workspaceVolumeName) {
623627
// ref: https://github.com/gitpod-io/gitpod/issues/14032
628+
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
629+
if metrics != nil {
630+
metrics.totalCannotMountVolumeVec.WithLabelValues(wsType, wsClass).Inc()
631+
}
624632
return "", nil
625633
} else {
626634
// if this happens we did not do a good job because that means we've introduced another volume to the pod
Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,3 @@
11
{
2-
"actions": [
3-
{
4-
"Func": "markWorkspace",
5-
"Params": {
6-
"annotations": [
7-
{
8-
"Name": "gitpod/failedBeforeStopping",
9-
"Value": "true",
10-
"Delete": false
11-
}
12-
],
13-
"workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148"
14-
}
15-
},
16-
{
17-
"Func": "stopWorkspace",
18-
"Params": {
19-
"gracePeriod": 30000000000,
20-
"workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148"
21-
}
22-
}
23-
]
24-
}
2+
"actions": null
3+
}

components/ws-manager/pkg/manager/testdata/status_failedWorkspaceMount_PENDING00.golden

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
},
1717
"phase": 1,
1818
"conditions": {
19-
"failed": "cannot mount workspace",
2019
"volume_snapshot": {}
2120
},
2221
"message": "pod is pending",

0 commit comments

Comments
 (0)