Skip to content

Commit e07b0c8

Browse files
authored
[ws-manager-mk2] Refactor metric collection and add tests (#16585)
1 parent bec8713 commit e07b0c8

File tree

4 files changed

+200
-35
lines changed

4 files changed

+200
-35
lines changed

components/ws-manager-mk2/controllers/metrics.go

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"strings"
1010
"time"
1111

12+
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
1213
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
1314
"github.com/go-logr/logr"
1415
lru "github.com/hashicorp/golang-lru"
@@ -185,22 +186,54 @@ func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *work
185186
counter.Inc()
186187
}
187188

188-
func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace) {
189-
m.cache.Add(ws.Name, ws.Status.Phase)
189+
func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {
190+
var s metricState
191+
if state != nil {
192+
s = *state
193+
} else {
194+
s = newMetricState(ws)
195+
}
196+
m.cache.Add(ws.Name, s)
190197
}
191198

192199
func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {
193200
m.cache.Remove(ws.Name)
194201
}
195202

196-
func (m *controllerMetrics) shouldUpdate(log *logr.Logger, ws *workspacev1.Workspace) bool {
197-
p, ok := m.cache.Get(ws.Name)
203+
// metricState is used to track which metrics have been recorded for a workspace.
204+
type metricState struct {
205+
phase workspacev1.WorkspacePhase
206+
recordedStartTime bool
207+
recordedInitFailure bool
208+
recordedStartFailure bool
209+
recordedContentReady bool
210+
recordedBackupFailed bool
211+
recordedBackupCompleted bool
212+
}
213+
214+
func newMetricState(ws *workspacev1.Workspace) metricState {
215+
return metricState{
216+
phase: ws.Status.Phase,
217+
// Here we assume that we've recorded metrics for the following states already if their conditions already exist.
218+
// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for
219+
// each workspace.
220+
recordedStartTime: ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
221+
recordedInitFailure: wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure"),
222+
recordedStartFailure: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
223+
recordedContentReady: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)),
224+
recordedBackupFailed: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)),
225+
recordedBackupCompleted: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)),
226+
}
227+
}
228+
229+
// getWorkspace returns the last recorded metric state for that workspace.
230+
func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {
231+
s, ok := m.cache.Get(ws.Name)
198232
if !ok {
199-
return false
233+
return false, metricState{}
200234
}
201235

202-
phase := p.(workspacev1.WorkspacePhase)
203-
return phase != ws.Status.Phase
236+
return true, s.(metricState)
204237
}
205238

206239
// Describe implements Collector. It will send exactly one Desc to the provided channel.

components/ws-manager-mk2/controllers/suite_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ var (
5050
ctx context.Context
5151
cancel context.CancelFunc
5252
wsActivity *activity.WorkspaceActivity
53+
wsMetrics *controllerMetrics
5354
)
5455

5556
var _ = BeforeSuite(func() {
@@ -102,6 +103,7 @@ var _ = BeforeSuite(func() {
102103

103104
conf := newTestConfig()
104105
wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry)
106+
wsMetrics = wsReconciler.metrics
105107
Expect(err).ToNot(HaveOccurred())
106108
Expect(wsReconciler.SetupWithManager(k8sManager)).To(Succeed())
107109

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
159159
// need to be deleted and re-created
160160
workspace.Status.PodStarts++
161161
}
162-
r.metrics.rememberWorkspace(workspace)
162+
r.metrics.rememberWorkspace(workspace, nil)
163163

164164
case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
165165
// Done stopping workspace - remove finalizer.
@@ -249,48 +249,58 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
249249
func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) {
250250
log := log.FromContext(ctx)
251251

252-
phase := workspace.Status.Phase
253-
254-
if !r.metrics.shouldUpdate(&log, workspace) {
252+
ok, lastState := r.metrics.getWorkspace(&log, workspace)
253+
if !ok {
255254
return
256255
}
257256

258-
switch {
259-
case phase == workspacev1.WorkspacePhasePending ||
260-
phase == workspacev1.WorkspacePhaseCreating ||
261-
phase == workspacev1.WorkspacePhaseInitializing:
257+
if !lastState.recordedInitFailure && wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") {
258+
r.metrics.countTotalRestoreFailures(&log, workspace)
259+
lastState.recordedInitFailure = true
262260

263-
if wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") {
264-
r.metrics.countTotalRestoreFailures(&log, workspace)
261+
if !lastState.recordedStartFailure {
265262
r.metrics.countWorkspaceStartFailures(&log, workspace)
263+
lastState.recordedStartFailure = true
266264
}
265+
}
267266

268-
if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
269-
r.metrics.countWorkspaceStartFailures(&log, workspace)
270-
}
267+
if !lastState.recordedStartFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
268+
// Only record if there was no other start failure recorded yet, to ensure max one
269+
// start failure gets recorded per workspace.
270+
r.metrics.countWorkspaceStartFailures(&log, workspace)
271+
lastState.recordedStartFailure = true
272+
}
271273

272-
case phase == workspacev1.WorkspacePhaseRunning:
273-
r.metrics.recordWorkspaceStartupTime(&log, workspace)
274-
if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
275-
r.metrics.countTotalRestores(&log, workspace)
276-
}
274+
if !lastState.recordedContentReady && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
275+
r.metrics.countTotalRestores(&log, workspace)
276+
lastState.recordedContentReady = true
277+
}
277278

278-
case phase == workspacev1.WorkspacePhaseStopped:
279-
if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) {
280-
r.metrics.countTotalBackups(&log, workspace)
281-
r.metrics.countTotalBackupFailures(&log, workspace)
282-
}
279+
if !lastState.recordedBackupFailed && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) {
280+
r.metrics.countTotalBackups(&log, workspace)
281+
r.metrics.countTotalBackupFailures(&log, workspace)
282+
lastState.recordedBackupFailed = true
283+
}
283284

284-
if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
285-
r.metrics.countTotalBackups(&log, workspace)
286-
}
285+
if !lastState.recordedBackupCompleted && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
286+
r.metrics.countTotalBackups(&log, workspace)
287+
lastState.recordedBackupCompleted = true
288+
}
287289

290+
if !lastState.recordedStartTime && workspace.Status.Phase == workspacev1.WorkspacePhaseRunning {
291+
r.metrics.recordWorkspaceStartupTime(&log, workspace)
292+
lastState.recordedStartTime = true
293+
}
294+
295+
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
288296
r.metrics.countWorkspaceStop(&log, workspace)
297+
298+
// Forget about this workspace, no more state updates will be recorded after this.
289299
r.metrics.forgetWorkspace(workspace)
290300
return
291301
}
292302

293-
r.metrics.rememberWorkspace(workspace)
303+
r.metrics.rememberWorkspace(workspace, &lastState)
294304
}
295305

296306
func (r *WorkspaceReconciler) deleteWorkspacePod(ctx context.Context, pod *corev1.Pod, reason string) (ctrl.Result, error) {

0 commit comments

Comments
 (0)