diff --git a/components/ws-daemon/pkg/controller/controller.go b/components/ws-daemon/pkg/controller/controller.go index 6cb64b7e7bf598..96f814c657a2aa 100644 --- a/components/ws-daemon/pkg/controller/controller.go +++ b/components/ws-daemon/pkg/controller/controller.go @@ -53,6 +53,7 @@ type WorkspaceController struct { NodeName string opts *WorkspaceControllerOpts operations WorkspaceOperations + metrics *workspaceMetrics } func NewWorkspaceController(c client.Client, opts WorkspaceControllerOpts) (*WorkspaceController, error) { @@ -71,6 +72,9 @@ func NewWorkspaceController(c client.Client, opts WorkspaceControllerOpts) (*Wor return nil, err } + metrics := newWorkspaceMetrics() + opts.MetricsRegistry.Register(metrics) + ops, err := NewWorkspaceOperations(opts.ContentConfig, store, opts.MetricsRegistry) if err != nil { return nil, err @@ -81,6 +85,7 @@ func NewWorkspaceController(c client.Client, opts WorkspaceControllerOpts) (*Wor NodeName: opts.NodeName, opts: &opts, operations: *ops, + metrics: metrics, }, nil } @@ -155,6 +160,7 @@ func (wsc *WorkspaceController) handleWorkspaceInit(ctx context.Context, ws *wor return ctrl.Result{}, err } + initStart := time.Now() alreadyInit, failure, err := wsc.operations.InitWorkspaceContent(ctx, InitContentOptions{ Meta: WorkspaceMeta{ Owner: ws.Spec.Ownership.Owner, @@ -198,6 +204,10 @@ func (wsc *WorkspaceController) handleWorkspaceInit(ctx context.Context, ws *wor return wsc.Status().Update(ctx, ws) }) + if err == nil { + wsc.metrics.recordInitializeTime(time.Since(initStart).Seconds(), ws) + } + return ctrl.Result{}, err } @@ -220,6 +230,7 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor return ctrl.Result{}, nil } + disposeStart := time.Now() alreadyDisposing, gitStatus, disposeErr := wsc.operations.DisposeWorkspace(ctx, DisposeOptions{ Meta: WorkspaceMeta{ Owner: ws.Spec.Ownership.Owner, @@ -266,6 +277,10 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor return wsc.Status().Update(ctx, ws) }) + if err == nil { + wsc.metrics.recordFinalizeTime(time.Since(disposeStart).Seconds(), ws) + } + return ctrl.Result{}, err } @@ -285,3 +300,63 @@ func toWorkspaceGitStatus(status *csapi.GitStatus) *workspacev1.GitStatus { TotalUnpushedCommits: status.TotalUnpushedCommits, } } + +type workspaceMetrics struct { + initializeTimeHistVec *prometheus.HistogramVec + finalizeTimeHistVec *prometheus.HistogramVec +} + +func newWorkspaceMetrics() *workspaceMetrics { + return &workspaceMetrics{ + initializeTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "gitpod", + Subsystem: "ws_daemon", + Name: "workspace_initialize_seconds", + Help: "time it took to initialize workspace", + Buckets: prometheus.ExponentialBuckets(2, 2, 10), + }, []string{"type", "class"}), + finalizeTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "gitpod", + Subsystem: "ws_daemon", + Name: "workspace_finalize_seconds", + Help: "time it took to finalize workspace", + Buckets: prometheus.ExponentialBuckets(2, 2, 10), + }, []string{"type", "class"}), + } +} + +func (m *workspaceMetrics) recordInitializeTime(duration float64, ws *workspacev1.Workspace) { + tpe := string(ws.Spec.Type) + class := ws.Spec.Class + + hist, err := m.initializeTimeHistVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + glog.WithError(err).WithField("type", tpe).WithField("class", class).Infof("could not retrieve initialize metric") + } + + hist.Observe(duration) +} + +func (m *workspaceMetrics) recordFinalizeTime(duration float64, ws *workspacev1.Workspace) { + tpe := string(ws.Spec.Type) + class := ws.Spec.Class + + hist, err := m.finalizeTimeHistVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + glog.WithError(err).WithField("type", tpe).WithField("class", class).Infof("could not retrieve finalize metric") + } + + hist.Observe(duration) +} + +// Describe implements Collector. It will send exactly one Desc to the provided channel. +func (m *workspaceMetrics) Describe(ch chan<- *prometheus.Desc) { + m.initializeTimeHistVec.Describe(ch) + m.finalizeTimeHistVec.Describe(ch) +} + +// Collect implements Collector. +func (m *workspaceMetrics) Collect(ch chan<- prometheus.Metric) { + m.initializeTimeHistVec.Collect(ch) + m.finalizeTimeHistVec.Collect(ch) +} diff --git a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml index d723616da4083f..ea512182b5a8b2 100644 --- a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml +++ b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml @@ -1,6 +1,6 @@ -# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Copyright (c) 2023 Gitpod GmbH. All rights reserved. # Licensed under the GNU Affero General Public License (AGPL). -# See License-AGPL.txt in the project root for license information. +# See License.AGPL.txt in the project root for license information. --- apiVersion: apiextensions.k8s.io/v1 diff --git a/components/ws-manager-mk2/config/webhook/manifests.yaml b/components/ws-manager-mk2/config/webhook/manifests.yaml index 7b87f90c4b563a..9c594e2717cb89 100644 --- a/components/ws-manager-mk2/config/webhook/manifests.yaml +++ b/components/ws-manager-mk2/config/webhook/manifests.yaml @@ -1,6 +1,6 @@ -# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Copyright (c) 2023 Gitpod GmbH. All rights reserved. # Licensed under the GNU Affero General Public License (AGPL). -# See License-AGPL.txt in the project root for license information. +# See License.AGPL.txt in the project root for license information. --- apiVersion: admissionregistration.k8s.io/v1 diff --git a/components/ws-manager-mk2/controllers/metrics.go b/components/ws-manager-mk2/controllers/metrics.go new file mode 100644 index 00000000000000..48de1f53f197c4 --- /dev/null +++ b/components/ws-manager-mk2/controllers/metrics.go @@ -0,0 +1,343 @@ +// Copyright (c) 2023 Gitpod GmbH. All rights reserved. +// Licensed under the GNU Affero General Public License (AGPL). +// See License.AGPL.txt in the project root for license information. + +package controllers + +import ( + "context" + "strings" + "time" + + workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1" + "github.com/go-logr/logr" + lru "github.com/hashicorp/golang-lru" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + workspaceStartupSeconds string = "workspace_startup_seconds" + workspaceStartFailuresTotal string = "workspace_starts_failure_total" + workspaceStopsTotal string = "workspace_stops_total" + workspaceBackupsTotal string = "workspace_backups_total" + workspaceBackupFailuresTotal string = "workspace_backups_failure_total" + workspaceRestoresTotal string = "workspace_restores_total" + workspaceRestoresFailureTotal string = "workspace_restores_failure_total" +) + +type controllerMetrics struct { + startupTimeHistVec *prometheus.HistogramVec + totalStartsFailureCounterVec *prometheus.CounterVec + totalStopsCounterVec *prometheus.CounterVec + + totalBackupCounterVec *prometheus.CounterVec + totalBackupFailureCounterVec *prometheus.CounterVec + totalRestoreCounterVec *prometheus.CounterVec + totalRestoreFailureCounterVec *prometheus.CounterVec + + workspacePhases *phaseTotalVec + timeoutSettings *timeoutSettingsVec + + // used to prevent recording metrics multiple times + cache *lru.Cache +} + +func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) { + cache, err := lru.New(6000) + if err != nil { + return nil, err + } + + return &controllerMetrics{ + startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceStartupSeconds, + Help: "time it took for workspace pods to reach the running phase", + Buckets: prometheus.ExponentialBuckets(2, 2, 10), + }, []string{"type", "class"}), + totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceStartFailuresTotal, + Help: "total number of workspaces that failed to start", + }, []string{"type", "class"}), + totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceStopsTotal, + Help: "total number of workspaces stopped", + }, []string{"reason", "type", "class"}), + + totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceBackupsTotal, + Help: "total number of workspace backups", + }, []string{"type", "class"}), + totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceBackupFailuresTotal, + Help: "total number of workspace backup failures", + }, []string{"type", "class"}), + totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceRestoresTotal, + Help: "total number of workspace restores", + }, []string{"type", "class"}), + totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceRestoresFailureTotal, + Help: "total number of workspace restore failures", + }, []string{"type", "class"}), + + workspacePhases: newPhaseTotalVec(r), + timeoutSettings: newTimeoutSettingsVec(r), + cache: cache, + }, nil +} + +func (m *controllerMetrics) recordWorkspaceStartupTime(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + hist, err := m.startupTimeHistVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not record workspace startup time", "type", tpe, "class", class) + } + + hist.Observe(float64(time.Since(ws.CreationTimestamp.Time).Seconds())) +} + +func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalStartsFailureCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not count workspace startup failure", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues("unknown", tpe, class) + if err != nil { + log.Error(err, "could not count workspace stop", "reason", "unknown", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalBackupCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not count workspace backup", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) countTotalBackupFailures(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalBackupFailureCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not count workspace backup failure", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) countTotalRestores(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalRestoreCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not count workspace restore", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + + counter, err := m.totalRestoreFailureCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.Error(err, "could not count workspace restore failure", "type", tpe, "class", class) + } + + counter.Inc() +} + +func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace) { + m.cache.Add(ws.Name, ws.Status.Phase) +} + +func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) { + m.cache.Remove(ws.Name) +} + +func (m *controllerMetrics) shouldUpdate(log *logr.Logger, ws *workspacev1.Workspace) bool { + p, ok := m.cache.Get(ws.Name) + if !ok { + return false + } + + phase := p.(workspacev1.WorkspacePhase) + return phase != ws.Status.Phase +} + +// Describe implements Collector. It will send exactly one Desc to the provided channel. +func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) { + m.startupTimeHistVec.Describe(ch) + m.totalStopsCounterVec.Describe(ch) + m.totalStartsFailureCounterVec.Describe(ch) + + m.totalBackupCounterVec.Describe(ch) + m.totalBackupFailureCounterVec.Describe(ch) + m.totalRestoreCounterVec.Describe(ch) + m.totalRestoreFailureCounterVec.Describe(ch) + + m.workspacePhases.Describe(ch) + m.timeoutSettings.Describe(ch) +} + +// Collect implements Collector. +func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) { + m.startupTimeHistVec.Collect(ch) + m.totalStopsCounterVec.Collect(ch) + m.totalStartsFailureCounterVec.Collect(ch) + + m.totalBackupCounterVec.Collect(ch) + m.totalBackupFailureCounterVec.Collect(ch) + m.totalRestoreCounterVec.Collect(ch) + m.totalRestoreFailureCounterVec.Collect(ch) + + m.workspacePhases.Collect(ch) + m.timeoutSettings.Collect(ch) +} + +// phaseTotalVec returns a gauge vector counting the workspaces per phase +type phaseTotalVec struct { + name string + desc *prometheus.Desc + reconciler *WorkspaceReconciler +} + +func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec { + name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total") + return &phaseTotalVec{ + name: name, + desc: prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})), + reconciler: r, + } +} + +// Describe implements Collector. It will send exactly one Desc to the provided channel. +func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) { + ch <- ptv.desc +} + +// Collect implements Collector. +func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout) + defer cancel() + + var workspaces workspacev1.WorkspaceList + err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace)) + if err != nil { + return + } + + counts := make(map[string]int) + for _, ws := range workspaces.Items { + counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++ + } + + for key, count := range counts { + segs := strings.Split(key, "::") + tpe, phase, class := segs[0], segs[1], segs[2] + + metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class) + if err != nil { + continue + } + + ch <- metric + } +} + +// timeoutSettingsVec provides a gauge of the currently active/inactive workspaces. +// Adding both up returns the total number of workspaces. +type timeoutSettingsVec struct { + name string + reconciler *WorkspaceReconciler + desc *prometheus.Desc +} + +func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec { + name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total") + desc := prometheus.NewDesc( + name, + "Current number of workspaces per timeout setting", + []string{"timeout"}, + prometheus.Labels(map[string]string{}), + ) + return &timeoutSettingsVec{ + name: name, + reconciler: r, + desc: desc, + } +} + +// Describe implements Collector. It will send exactly one Desc to the provided channel. +func (vec *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) { + ch <- vec.desc +} + +// Collect implements Collector. +func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout) + defer cancel() + + var workspaces workspacev1.WorkspaceList + err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace)) + if err != nil { + return + } + + timeouts := make(map[time.Duration]int) + for _, ws := range workspaces.Items { + if ws.Spec.Timeout.Time == nil { + continue + } + + timeouts[ws.Spec.Timeout.Time.Duration]++ + } + + for phase, cnt := range timeouts { + // metrics cannot be re-used, we have to create them every single time + metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), phase.String()) + if err != nil { + continue + } + + ch <- metric + } +} diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go index 5feb03761bb23e..456210b7de3c90 100644 --- a/components/ws-manager-mk2/controllers/status.go +++ b/components/ws-manager-mk2/controllers/status.go @@ -32,6 +32,10 @@ func updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace switch len(pods.Items) { case 0: + if workspace.Status.Phase == "" { + workspace.Status.Phase = workspacev1.WorkspacePhasePending + } + if workspace.Status.Phase != workspacev1.WorkspacePhasePending { workspace.Status.Phase = workspacev1.WorkspacePhaseStopped } @@ -91,7 +95,6 @@ func updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace switch { case isPodBeingDeleted(pod): - log.Info("setting phase for workspace to stopping", "workspace", workspace.Name) workspace.Status.Phase = workspacev1.WorkspacePhaseStopping var hasFinalizer bool @@ -102,9 +105,10 @@ func updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace } } if hasFinalizer { - // TODO(cw): if the condition isn't present or not true, we should re-trigger the reconiliation if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) || - conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) { + conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) || + conditionWithStatusAndReson(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") { + workspace.Status.Phase = workspacev1.WorkspacePhaseStopped } diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go index a13bfa3bccb5c0..7500a67dd121e3 100644 --- a/components/ws-manager-mk2/controllers/workspace_controller.go +++ b/components/ws-manager-mk2/controllers/workspace_controller.go @@ -18,15 +18,31 @@ import ( config "github.com/gitpod-io/gitpod/ws-manager/api/config" workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1" + "github.com/prometheus/client_golang/prometheus" ) -func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg config.Configuration) (*WorkspaceReconciler, error) { - res := &WorkspaceReconciler{ +const ( + metricsNamespace = "gitpod" + metricsWorkspaceSubsystem = "ws_manager_mk2" + // kubernetesOperationTimeout is the time we give Kubernetes operations in general. + kubernetesOperationTimeout = 5 * time.Second +) + +func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg config.Configuration, reg prometheus.Registerer) (*WorkspaceReconciler, error) { + reconciler := &WorkspaceReconciler{ Client: c, Scheme: scheme, Config: cfg, } - return res, nil + + metrics, err := newControllerMetrics(reconciler) + if err != nil { + return nil, err + } + reg.MustRegister(metrics) + reconciler.metrics = metrics + + return reconciler, nil } // WorkspaceReconciler reconciles a Workspace object @@ -35,6 +51,7 @@ type WorkspaceReconciler struct { Scheme *runtime.Scheme Config config.Configuration + metrics *controllerMetrics OnReconcile func(ctx context.Context, ws *workspacev1.Workspace) } @@ -84,6 +101,8 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } + r.updateMetrics(ctx, &workspace) + result, err := r.actOnStatus(ctx, &workspace, workspacePods) if err != nil { return result, err @@ -103,37 +122,46 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *workspacev1.Workspace, workspacePods corev1.PodList) (ctrl.Result, error) { - logger := log.FromContext(ctx) + log := log.FromContext(ctx) - // if there isn't a workspace pod and we're not currently deleting this workspace, - // create one. - if len(workspacePods.Items) == 0 && workspace.Status.PodStarts == 0 { - sctx, err := newStartWorkspaceContext(ctx, &r.Config, workspace) - if err != nil { - logger.Error(err, "unable to create startWorkspace context") - return ctrl.Result{Requeue: true}, err - } + if len(workspacePods.Items) == 0 { + // if there isn't a workspace pod and we're not currently deleting this workspace,// create one. + switch { + case workspace.Status.PodStarts == 0: + sctx, err := newStartWorkspaceContext(ctx, &r.Config, workspace) + if err != nil { + log.Error(err, "unable to create startWorkspace context") + return ctrl.Result{Requeue: true}, err + } - pod, err := r.createWorkspacePod(sctx) - if err != nil { - logger.Error(err, "unable to produce workspace pod") - return ctrl.Result{}, err - } + pod, err := r.createWorkspacePod(sctx) + if err != nil { + log.Error(err, "unable to produce workspace pod") + return ctrl.Result{}, err + } - if err := ctrl.SetControllerReference(workspace, pod, r.Scheme); err != nil { - return ctrl.Result{}, err - } + if err := ctrl.SetControllerReference(workspace, pod, r.Scheme); err != nil { + return ctrl.Result{}, err + } - err = r.Create(ctx, pod) - if errors.IsAlreadyExists(err) { - // pod exists, we're good - } else if err != nil { - logger.Error(err, "unable to create Pod for Workspace", "pod", pod) - return ctrl.Result{Requeue: true}, err - } else { - // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled, - // need to be deleted and re-created - workspace.Status.PodStarts++ + err = r.Create(ctx, pod) + if errors.IsAlreadyExists(err) { + // pod exists, we're good + } else if err != nil { + log.Error(err, "unable to create Pod for Workspace", "pod", pod) + return ctrl.Result{Requeue: true}, err + } else { + // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled, + // need to be deleted and re-created + workspace.Status.PodStarts++ + } + r.metrics.rememberWorkspace(workspace) + + case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped: + err := r.Client.Delete(ctx, workspace) + if err != nil { + return ctrl.Result{Requeue: true}, err + } } return ctrl.Result{}, nil @@ -164,6 +192,15 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp return ctrl.Result{Requeue: true}, err } + // if the content initialization failed, delete the pod + case conditionWithStatusAndReson(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") && !isPodBeingDeleted(pod): + err := r.Client.Delete(ctx, pod) + if errors.IsNotFound(err) { + // pod is gone - nothing to do here + } else { + return ctrl.Result{Requeue: true}, err + } + // we've disposed already - try to remove the finalizer and call it a day case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped: var foundFinalizer bool @@ -186,14 +223,56 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp // reque to remove workspace return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } + } - err = r.Client.Delete(ctx, workspace) - if err != nil { - return ctrl.Result{Requeue: true}, err + return ctrl.Result{}, nil +} + +func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) { + log := log.FromContext(ctx) + + phase := workspace.Status.Phase + + if !r.metrics.shouldUpdate(&log, workspace) { + return + } + + switch { + case phase == workspacev1.WorkspacePhasePending || + phase == workspacev1.WorkspacePhaseCreating || + phase == workspacev1.WorkspacePhaseInitializing: + + if conditionWithStatusAndReson(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") { + r.metrics.countTotalRestoreFailures(&log, workspace) + r.metrics.countWorkspaceStartFailures(&log, workspace) + } + + if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) { + r.metrics.countWorkspaceStartFailures(&log, workspace) } + + case phase == workspacev1.WorkspacePhaseRunning: + r.metrics.recordWorkspaceStartupTime(&log, workspace) + if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) { + r.metrics.countTotalRestores(&log, workspace) + } + + case phase == workspacev1.WorkspacePhaseStopped: + if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) { + r.metrics.countTotalBackups(&log, workspace) + r.metrics.countTotalBackupFailures(&log, workspace) + } + + if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) { + r.metrics.countTotalBackups(&log, workspace) + } + + r.metrics.countWorkspaceStop(&log, workspace) + r.metrics.forgetWorkspace(workspace) + return } - return ctrl.Result{}, nil + r.metrics.rememberWorkspace(workspace) } func conditionPresentAndTrue(cond []metav1.Condition, tpe string) bool { @@ -205,6 +284,15 @@ func conditionPresentAndTrue(cond []metav1.Condition, tpe string) bool { return false } +func conditionWithStatusAndReson(cond []metav1.Condition, tpe string, status bool, reason string) bool { + for _, c := range cond { + if c.Type == tpe { + return c.Type == tpe && c.Reason == reason + } + } + return false +} + var ( wsOwnerKey = ".metadata.controller" apiGVStr = workspacev1.GroupVersion.String() @@ -240,7 +328,7 @@ func (r *WorkspaceReconciler) SetupWithManager(mgr ctrl.Manager) error { func AddUniqueCondition(conds []metav1.Condition, cond metav1.Condition) []metav1.Condition { if cond.Reason == "" { - cond.Reason = "Foo" + cond.Reason = "unknown" } for i, c := range conds { diff --git a/components/ws-manager-mk2/main.go b/components/ws-manager-mk2/main.go index abb990acbbfa33..c27f0db94622d1 100644 --- a/components/ws-manager-mk2/main.go +++ b/components/ws-manager-mk2/main.go @@ -97,22 +97,24 @@ func main() { os.Exit(1) } - wsmanService, err := setupGRPCService(cfg, mgr.GetClient()) + reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), cfg.Manager, metrics.Registry) if err != nil { - setupLog.Error(err, "unable to start manager service") + setupLog.Error(err, "unable to create controller", "controller", "Workspace") os.Exit(1) } - reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), cfg.Manager) + wsmanService, err := setupGRPCService(cfg, mgr.GetClient()) if err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workspace") + setupLog.Error(err, "unable to start manager service") os.Exit(1) } + reconciler.OnReconcile = wsmanService.OnWorkspaceReconcile if err = reconciler.SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Workspace") os.Exit(1) } + // if err = (&workspacev1.Workspace{}).SetupWebhookWithManager(mgr); err != nil { // setupLog.Error(err, "unable to create webhook", "webhook", "Workspace") // os.Exit(1) @@ -168,7 +170,7 @@ func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client) (*ser grpcOpts = append(grpcOpts, grpc.UnknownServiceHandler(proxy.TransparentHandler(imagebuilderDirector(cfg.ImageBuilderProxy.TargetAddr)))) - srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager) + srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry) grpcServer := grpc.NewServer(grpcOpts...) grpc_prometheus.Register(grpcServer) diff --git a/components/ws-manager-mk2/service/manager.go b/components/ws-manager-mk2/service/manager.go index ba479a4123524a..acb2e7558a5f92 100644 --- a/components/ws-manager-mk2/service/manager.go +++ b/components/ws-manager-mk2/service/manager.go @@ -13,6 +13,7 @@ import ( validation "github.com/go-ozzo/ozzo-validation" "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/xerrors" "google.golang.org/grpc/codes" "google.golang.org/grpc/peer" @@ -40,10 +41,14 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration) *WorkspaceManagerServer { +func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration, reg prometheus.Registerer) *WorkspaceManagerServer { + metrics := newWorkspaceMetrics() + reg.MustRegister(metrics) + return &WorkspaceManagerServer{ - Client: clnt, - Config: cfg, + Client: clnt, + Config: cfg, + metrics: metrics, subs: subscriptions{ subscribers: make(map[string]chan *wsmanapi.SubscribeResponse), }, @@ -51,8 +56,9 @@ func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration) *W } type WorkspaceManagerServer struct { - Client client.Client - Config *config.Configuration + Client client.Client + Config *config.Configuration + metrics *workspaceMetrics subs subscriptions wsmanapi.UnimplementedWorkspaceManagerServer @@ -191,6 +197,8 @@ func (wsm *WorkspaceManagerServer) StartWorkspace(ctx context.Context, req *wsma Ports: ports, }, } + + wsm.metrics.recordWorkspaceStart(&ws) err = wsm.Client.Create(ctx, &ws) if err != nil { log.WithError(err).WithFields(owi).Error("error creating workspace") @@ -836,3 +844,39 @@ func (subs *subscriptions) OnChange(ctx context.Context, status *api.WorkspaceSt log.WithField("status", status).Error("workspace in UNKNOWN phase") } } + +type workspaceMetrics struct { + totalStartsCounterVec *prometheus.CounterVec +} + +func newWorkspaceMetrics() *workspaceMetrics { + return &workspaceMetrics{ + totalStartsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "gitpod", + Subsystem: "ws_manager_mk2", + Name: "workspace_starts_total", + Help: "total number of workspaces started", + }, []string{"type", "class"}), + } +} + +func (m *workspaceMetrics) recordWorkspaceStart(ws *workspacev1.Workspace) { + tpe := string(ws.Spec.Type) + class := ws.Spec.Class + + counter, err := m.totalStartsCounterVec.GetMetricWithLabelValues(tpe, class) + if err != nil { + log.WithError(err).WithField("type", tpe).WithField("class", class) + } + counter.Inc() +} + +// Describe implements Collector. It will send exactly one Desc to the provided channel. +func (m *workspaceMetrics) Describe(ch chan<- *prometheus.Desc) { + m.totalStartsCounterVec.Describe(ch) +} + +// Collect implements Collector. +func (m *workspaceMetrics) Collect(ch chan<- prometheus.Metric) { + m.totalStartsCounterVec.Collect(ch) +}