diff --git a/components/ws-manager-mk2/controllers/metrics.go b/components/ws-manager-mk2/controllers/metrics.go
index 48de1f53f197c4..03f644c7bd45e1 100644
--- a/components/ws-manager-mk2/controllers/metrics.go
+++ b/components/ws-manager-mk2/controllers/metrics.go
@@ -9,6 +9,7 @@ import (
 	"strings"
 	"time"
 
+	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
 	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
 	"github.com/go-logr/logr"
 	lru "github.com/hashicorp/golang-lru"
@@ -185,22 +186,54 @@ func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *work
 	counter.Inc()
 }
 
-func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace) {
-	m.cache.Add(ws.Name, ws.Status.Phase)
+func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {
+	var s metricState
+	if state != nil {
+		s = *state
+	} else {
+		s = newMetricState(ws)
+	}
+	m.cache.Add(ws.Name, s)
 }
 
 func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {
 	m.cache.Remove(ws.Name)
 }
 
-func (m *controllerMetrics) shouldUpdate(log *logr.Logger, ws *workspacev1.Workspace) bool {
-	p, ok := m.cache.Get(ws.Name)
+// metricState is used to track which metrics have been recorded for a workspace.
+type metricState struct {
+	phase                   workspacev1.WorkspacePhase
+	recordedStartTime       bool
+	recordedInitFailure     bool
+	recordedStartFailure    bool
+	recordedContentReady    bool
+	recordedBackupFailed    bool
+	recordedBackupCompleted bool
+}
+
+func newMetricState(ws *workspacev1.Workspace) metricState {
+	return metricState{
+		phase: ws.Status.Phase,
+		// Here we assume that we've recorded metrics for the following states already if their conditions already exist.
+		// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for
+		// each workspace.
+		recordedStartTime:       ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
+		recordedInitFailure:     wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure"),
+		recordedStartFailure:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
+		recordedContentReady:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)),
+		recordedBackupFailed:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)),
+		recordedBackupCompleted: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)),
+	}
+}
+
+// getWorkspace returns the last recorded metric state for that workspace.
+func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {
+	s, ok := m.cache.Get(ws.Name)
 	if !ok {
-		return false
+		return false, metricState{}
 	}
 
-	phase := p.(workspacev1.WorkspacePhase)
-	return phase != ws.Status.Phase
+	return true, s.(metricState)
 }
 
 // Describe implements Collector. It will send exactly one Desc to the provided channel.
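Note: the hunk above swaps the phase-only cache entry for a per-workspace metricState, so each metric is recorded at most once even when the controller reconciles the same workspace repeatedly, and the flags are rebuilt from existing conditions after a controller restart. Below is a minimal, self-contained sketch of that record-once pattern; the recordStartOnce helper, the exampleState fields, and the workspace name are made up for illustration, only the golang-lru calls mirror the real code.

	package main

	import (
		"fmt"

		lru "github.com/hashicorp/golang-lru"
	)

	// exampleState mirrors the idea of metricState: one "already recorded" flag per metric.
	type exampleState struct {
		recordedStart bool
	}

	// recordStartOnce calls inc() only if the cached state says the start metric has
	// not been recorded yet, then writes the updated state back into the cache.
	func recordStartOnce(cache *lru.Cache, name string, inc func()) {
		var s exampleState
		if v, ok := cache.Get(name); ok {
			s = v.(exampleState)
		}
		if !s.recordedStart {
			inc()
			s.recordedStart = true
		}
		cache.Add(name, s)
	}

	func main() {
		cache, _ := lru.New(16)
		starts := 0
		// Reconciling the same workspace twice still counts a single start.
		recordStartOnce(cache, "ws-abc", func() { starts++ })
		recordStartOnce(cache, "ws-abc", func() { starts++ })
		fmt.Println(starts) // prints 1
	}
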
diff --git a/components/ws-manager-mk2/controllers/suite_test.go b/components/ws-manager-mk2/controllers/suite_test.go
index df358bf3c35679..c28484ad0e52e1 100644
--- a/components/ws-manager-mk2/controllers/suite_test.go
+++ b/components/ws-manager-mk2/controllers/suite_test.go
@@ -50,6 +50,7 @@ var (
 	ctx        context.Context
 	cancel     context.CancelFunc
 	wsActivity *activity.WorkspaceActivity
+	wsMetrics  *controllerMetrics
 )
 
 var _ = BeforeSuite(func() {
@@ -102,6 +103,7 @@ var _ = BeforeSuite(func() {
 	conf := newTestConfig()
 	wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry)
+	wsMetrics = wsReconciler.metrics
 	Expect(err).ToNot(HaveOccurred())
 	Expect(wsReconciler.SetupWithManager(k8sManager)).To(Succeed())
 
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 1192084e0ae311..91b79048a6c243 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -159,7 +159,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 			// need to be deleted and re-created
 			workspace.Status.PodStarts++
 		}
-		r.metrics.rememberWorkspace(workspace)
+		r.metrics.rememberWorkspace(workspace, nil)
 
 	case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
 		// Done stopping workspace - remove finalizer.
@@ -249,48 +249,58 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) {
 	log := log.FromContext(ctx)
 
-	phase := workspace.Status.Phase
-
-	if !r.metrics.shouldUpdate(&log, workspace) {
+	ok, lastState := r.metrics.getWorkspace(&log, workspace)
+	if !ok {
 		return
 	}
 
-	switch {
-	case phase == workspacev1.WorkspacePhasePending ||
-		phase == workspacev1.WorkspacePhaseCreating ||
-		phase == workspacev1.WorkspacePhaseInitializing:
+	if !lastState.recordedInitFailure && wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") {
+		r.metrics.countTotalRestoreFailures(&log, workspace)
+		lastState.recordedInitFailure = true
 
-		if wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") {
-			r.metrics.countTotalRestoreFailures(&log, workspace)
+		if !lastState.recordedStartFailure {
 			r.metrics.countWorkspaceStartFailures(&log, workspace)
+			lastState.recordedStartFailure = true
 		}
+	}
 
-		if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
-			r.metrics.countWorkspaceStartFailures(&log, workspace)
-		}
+	if !lastState.recordedStartFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
+		// Only record if there was no other start failure recorded yet, to ensure max one
+		// start failure gets recorded per workspace.
+		r.metrics.countWorkspaceStartFailures(&log, workspace)
+		lastState.recordedStartFailure = true
+	}
 
-	case phase == workspacev1.WorkspacePhaseRunning:
-		r.metrics.recordWorkspaceStartupTime(&log, workspace)
-		if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
-			r.metrics.countTotalRestores(&log, workspace)
-		}
+	if !lastState.recordedContentReady && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
+		r.metrics.countTotalRestores(&log, workspace)
+		lastState.recordedContentReady = true
+	}
 
-	case phase == workspacev1.WorkspacePhaseStopped:
-		if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) {
-			r.metrics.countTotalBackups(&log, workspace)
-			r.metrics.countTotalBackupFailures(&log, workspace)
-		}
+	if !lastState.recordedBackupFailed && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) {
+		r.metrics.countTotalBackups(&log, workspace)
+		r.metrics.countTotalBackupFailures(&log, workspace)
+		lastState.recordedBackupFailed = true
+	}
 
-		if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
-			r.metrics.countTotalBackups(&log, workspace)
-		}
+	if !lastState.recordedBackupCompleted && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
+		r.metrics.countTotalBackups(&log, workspace)
+		lastState.recordedBackupCompleted = true
+	}
 
+	if !lastState.recordedStartTime && workspace.Status.Phase == workspacev1.WorkspacePhaseRunning {
+		r.metrics.recordWorkspaceStartupTime(&log, workspace)
+		lastState.recordedStartTime = true
+	}
+
+	if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
 		r.metrics.countWorkspaceStop(&log, workspace)
+
+		// Forget about this workspace, no more state updates will be recorded after this.
 		r.metrics.forgetWorkspace(workspace)
 		return
 	}
 
-	r.metrics.rememberWorkspace(workspace)
+	r.metrics.rememberWorkspace(workspace, &lastState)
 }
 
 func (r *WorkspaceReconciler) deleteWorkspacePod(ctx context.Context, pod *corev1.Pod, reason string) (ctrl.Result, error) {
diff --git a/components/ws-manager-mk2/controllers/workspace_controller_test.go b/components/ws-manager-mk2/controllers/workspace_controller_test.go
index 9fdcb685eda6ae..5ca6fda23a845e 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller_test.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller_test.go
@@ -14,6 +14,9 @@ import (
 	"github.com/google/uuid"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
"github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "google.golang.org/protobuf/proto" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -30,6 +33,7 @@ var _ = Describe("WorkspaceController", func() { Context("with regular workspaces", func() { It("should handle successful workspace creation and stop request", func() { ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) pod := createWorkspaceExpectPod(ws) Expect(controllerutil.ContainsFinalizer(pod, workspacev1.GitpodFinalizerName)).To(BeTrue()) @@ -56,7 +60,19 @@ var _ = Describe("WorkspaceController", func() { g.Expect(ws.Status.URL).ToNot(BeEmpty()) }, timeout, interval).Should(Succeed()) - // TODO(wv): Once implemented, expect EverReady condition. + // Transition Pod to running, and expect workspace to reach Running phase. + // This should also cause e.g. startup time metrics to be recorded. + updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) { + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: "workspace", + Ready: true, + }} + }) + + expectPhaseEventually(ws, workspacev1.WorkspacePhaseRunning) + + markContentReady(ws) requestStop(ws) @@ -68,10 +84,18 @@ var _ = Describe("WorkspaceController", func() { Consistently(func() error { return checkNotFound(pod) }, duration, interval).Should(Succeed(), "pod came back") + + expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{ + starts: 1, + restores: 1, + stops: 1, + backups: 1, + }) }) It("should handle content init failure", func() { ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) pod := createWorkspaceExpectPod(ws) By("adding ws init failure condition") @@ -87,10 +111,17 @@ var _ = Describe("WorkspaceController", func() { // On init failure, expect workspace cleans up without a backup. expectWorkspaceCleanup(ws, pod) + + expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{ + startFailures: 1, + restoreFailures: 1, + stops: 1, + }) }) It("should handle backup failure", func() { ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) pod := createWorkspaceExpectPod(ws) // Stop the workspace. @@ -101,10 +132,17 @@ var _ = Describe("WorkspaceController", func() { // Workspace should get cleaned up. expectWorkspaceCleanup(ws, pod) + + expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{ + backups: 1, + backupFailures: 1, + stops: 1, + }) }) It("should handle workspace failure", func() { ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) pod := createWorkspaceExpectPod(ws) // Update Pod with failed exit status. 
@@ -125,10 +163,17 @@ var _ = Describe("WorkspaceController", func() {
 
 			expectFinalizerAndMarkBackupCompleted(ws, pod)
 			expectWorkspaceCleanup(ws, pod)
+
+			expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
+				startFailures: 1,
+				stops:         1,
+				backups:       1,
+			})
 		})
 
 		It("should clean up timed out workspaces", func() {
 			ws := newWorkspace(uuid.NewString(), "default")
+			m := collectMetricCounts(wsMetrics, ws)
 			pod := createWorkspaceExpectPod(ws)
 
 			By("adding Timeout condition")
@@ -143,10 +188,16 @@ var _ = Describe("WorkspaceController", func() {
 
 			expectFinalizerAndMarkBackupCompleted(ws, pod)
 			expectWorkspaceCleanup(ws, pod)
+
+			expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
+				stops:   1,
+				backups: 1,
+			})
 		})
 
 		It("should handle workspace abort", func() {
 			ws := newWorkspace(uuid.NewString(), "default")
+			m := collectMetricCounts(wsMetrics, ws)
 			pod := createWorkspaceExpectPod(ws)
 
 			// Update Pod with stop and abort conditions.
@@ -165,10 +216,15 @@ var _ = Describe("WorkspaceController", func() {
 
 			// Expect cleanup without a backup.
 			expectWorkspaceCleanup(ws, pod)
+
+			expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
+				stops: 1,
+			})
 		})
 
 		It("deleting workspace resource should gracefully clean up", func() {
 			ws := newWorkspace(uuid.NewString(), "default")
+			m := collectMetricCounts(wsMetrics, ws)
 			pod := createWorkspaceExpectPod(ws)
 
 			Expect(k8sClient.Delete(ctx, ws)).To(Succeed())
@@ -178,6 +234,11 @@ var _ = Describe("WorkspaceController", func() {
 
 			expectFinalizerAndMarkBackupCompleted(ws, pod)
 			expectWorkspaceCleanup(ws, pod)
+
+			expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
+				stops:   1,
+				backups: 1,
+			})
 		})
 	})
 
@@ -329,6 +390,19 @@ func requestStop(ws *workspacev1.Workspace) {
 	})
 }
 
+func markContentReady(ws *workspacev1.Workspace) {
+	GinkgoHelper()
+	By("adding content ready condition")
+	updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
+		ws.Status.Conditions = wsk8s.AddUniqueCondition(ws.Status.Conditions, metav1.Condition{
+			Type:               string(workspacev1.WorkspaceConditionContentReady),
+			Status:             metav1.ConditionTrue,
+			Reason:             "InitializationSuccess",
+			LastTransitionTime: metav1.Now(),
+		})
+	})
+}
+
 func expectFinalizerAndMarkBackupCompleted(ws *workspacev1.Workspace, pod *corev1.Pod) {
 	GinkgoHelper()
 	// Checking for the finalizer enforces our expectation that the workspace
@@ -476,3 +550,49 @@ func newWorkspace(name, namespace string) *workspacev1.Workspace {
 		},
 	}
 }
+
+type metricCounts struct {
+	starts          int
+	startFailures   int
+	stops           int
+	backups         int
+	backupFailures  int
+	restores        int
+	restoreFailures int
+}
+
+// collectHistCount is a hack to get the value of the histogram's sample count.
+// testutil.ToFloat64() does not accept histograms.
+func collectHistCount(h prometheus.Histogram) uint64 {
+	GinkgoHelper()
+	pb := &dto.Metric{}
+	Expect(h.Write(pb)).To(Succeed())
+	return pb.Histogram.GetSampleCount()
+}
+
+func collectMetricCounts(wsMetrics *controllerMetrics, ws *workspacev1.Workspace) metricCounts {
+	tpe := string(ws.Spec.Type)
+	cls := ws.Spec.Class
+	startHist := wsMetrics.startupTimeHistVec.WithLabelValues(tpe, cls).(prometheus.Histogram)
+	return metricCounts{
+		starts:          int(collectHistCount(startHist)),
+		startFailures:   int(testutil.ToFloat64(wsMetrics.totalStartsFailureCounterVec.WithLabelValues(tpe, cls))),
+		stops:           int(testutil.ToFloat64(wsMetrics.totalStopsCounterVec.WithLabelValues("unknown", tpe, cls))),
+		backups:         int(testutil.ToFloat64(wsMetrics.totalBackupCounterVec.WithLabelValues(tpe, cls))),
+		backupFailures:  int(testutil.ToFloat64(wsMetrics.totalBackupFailureCounterVec.WithLabelValues(tpe, cls))),
+		restores:        int(testutil.ToFloat64(wsMetrics.totalRestoreCounterVec.WithLabelValues(tpe, cls))),
+		restoreFailures: int(testutil.ToFloat64(wsMetrics.totalRestoreFailureCounterVec.WithLabelValues(tpe, cls))),
+	}
+}
+
+func expectMetricsDelta(initial metricCounts, cur metricCounts, expectedDelta metricCounts) {
+	GinkgoHelper()
+	By("checking metrics have been recorded")
+	Expect(cur.starts-initial.starts).To(Equal(expectedDelta.starts), "expected metric count delta for starts")
+	Expect(cur.startFailures-initial.startFailures).To(Equal(expectedDelta.startFailures), "expected metric count delta for startFailures")
+	Expect(cur.stops-initial.stops).To(Equal(expectedDelta.stops), "expected metric count delta for stops")
+	Expect(cur.backups-initial.backups).To(Equal(expectedDelta.backups), "expected metric count delta for backups")
+	Expect(cur.backupFailures-initial.backupFailures).To(Equal(expectedDelta.backupFailures), "expected metric count delta for backupFailures")
+	Expect(cur.restores-initial.restores).To(Equal(expectedDelta.restores), "expected metric count delta for restores")
+	Expect(cur.restoreFailures-initial.restoreFailures).To(Equal(expectedDelta.restoreFailures), "expected metric count delta for restoreFailures")
+}
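Note on the helpers above: testutil.ToFloat64 does not handle histograms, which is why collectHistCount reads the cumulative sample count through the dto.Metric protobuf instead. Below is a throwaway sketch of the same trick outside the suite; the metric name, buckets, and observations are invented for the example and are not ws-manager metrics.

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
		dto "github.com/prometheus/client_model/go"
	)

	func main() {
		// Illustrative histogram, not registered anywhere.
		hist := prometheus.NewHistogram(prometheus.HistogramOpts{
			Name:    "example_startup_seconds",
			Buckets: prometheus.LinearBuckets(1, 1, 5),
		})
		hist.Observe(2.5)
		hist.Observe(7.0)

		// Same approach as collectHistCount: write the metric into a dto.Metric
		// and read the cumulative sample count from the protobuf.
		pb := &dto.Metric{}
		if err := hist.Write(pb); err != nil {
			panic(err)
		}
		fmt.Println(pb.Histogram.GetSampleCount()) // prints 2
	}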