diff --git a/components/ws-manager/pkg/manager/manager.go b/components/ws-manager/pkg/manager/manager.go
index 5ff8cea835090c..4d63fbb2a6ca66 100644
--- a/components/ws-manager/pkg/manager/manager.go
+++ b/components/ws-manager/pkg/manager/manager.go
@@ -214,7 +214,11 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
 	}
 	span.LogKV("event", "pod description created")
 
-	var createPVC bool
+	var (
+		createPVC         bool
+		pvc               *corev1.PersistentVolumeClaim
+		volumeRestoreTime time.Time
+	)
 	for _, feature := range startContext.Request.Spec.FeatureFlags {
 		if feature == api.WorkspaceFeatureFlag_PERSISTENT_VOLUME_CLAIM {
 			createPVC = true
@@ -223,14 +227,15 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
 	}
 	if createPVC {
 		clog.Info("PVC feature detected, creating PVC object")
-		pvc, err := m.createPVCForWorkspacePod(startContext)
+		pvc, err = m.createPVCForWorkspacePod(startContext)
 		if err != nil {
 			return nil, xerrors.Errorf("cannot create pvc for workspace pod: %w", err)
 		}
 		err = m.Clientset.Create(ctx, pvc)
-		if err != nil {
+		if err != nil && !k8serr.IsAlreadyExists(err) {
 			return nil, xerrors.Errorf("cannot create pvc object for workspace pod: %w", err)
 		}
+		volumeRestoreTime = time.Now()
 	}
 
 	// create the Pod in the cluster and wait until is scheduled
@@ -261,6 +266,25 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
 			return false, err
 		}
 
+		if createPVC {
+			// The restore-time metric below measures from PVC creation until the claim is bound.
+			err = wait.PollWithContext(ctx, 100*time.Millisecond, time.Minute, pvcRunning(m.Clientset, pvc.Name, pvc.Namespace))
+			if err != nil {
+				// NOTE(review): the pod created earlier in this attempt is presumably still
+				// present when we retry from here - confirm the retry path copes with that.
+				clog.WithError(err).WithField("pvc", pvc.Name).Warn("pvc was not bound in time")
+				return false, nil
+			}
+
+			wsType := api.WorkspaceType_name[int32(req.Type)]
+			hist, err := m.metrics.volumeRestoreTimeHistVec.GetMetricWithLabelValues(wsType, req.Spec.Class)
+			if err != nil {
+				log.WithError(err).WithField("type", wsType).Warn("cannot get volume restore time histogram metric")
+			} else {
+				hist.Observe(time.Since(volumeRestoreTime).Seconds())
+			}
+		}
+
 		// wait at least 60 seconds before deleting pending pod and trying again due to pending PVC attachment
 		err = wait.PollWithContext(ctx, 100*time.Millisecond, 60*time.Second, podRunning(m.Clientset, pod.Name, pod.Namespace))
 		if err != nil {
@@ -381,6 +405,20 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
 	}
 }
 
+// pvcRunning returns a poll condition that reports true once the named
+// PersistentVolumeClaim has reached the Bound phase.
+func pvcRunning(clientset client.Client, pvcName, namespace string) wait.ConditionWithContextFunc {
+	return func(ctx context.Context) (bool, error) {
+		var pvc corev1.PersistentVolumeClaim
+		err := clientset.Get(ctx, types.NamespacedName{Namespace: namespace, Name: pvcName}, &pvc)
+		if err != nil {
+			// Lookup failures are reported as "not yet" so that the caller keeps polling until its timeout.
+			return false, nil
+		}
+		return pvc.Status.Phase == corev1.ClaimBound, nil
+	}
+}
+
 // validateStartWorkspaceRequest ensures that acting on this request will not leave the system in an invalid state
 func validateStartWorkspaceRequest(req *api.StartWorkspaceRequest) error {
 	err := validation.ValidateStruct(req.Spec,
diff --git a/components/ws-manager/pkg/manager/metrics.go b/components/ws-manager/pkg/manager/metrics.go
index e0669ef3dd9277..9653dc2b2801b6 100644
--- a/components/ws-manager/pkg/manager/metrics.go
+++ b/components/ws-manager/pkg/manager/metrics.go
@@ -42,6 +42,7 @@ type metrics struct {
 	initializeTimeHistVec     *prometheus.HistogramVec
 	finalizeTimeHistVec       *prometheus.HistogramVec
 	volumeSnapshotTimeHistVec *prometheus.HistogramVec
+	volumeRestoreTimeHistVec  *prometheus.HistogramVec
 
 	// Counter
 	totalStartsCounterVec *prometheus.CounterVec
@@ -90,7 +91,14 @@ func newMetrics(m *Manager) *metrics {
 			Name:      "volume_snapshot_seconds",
 			Help:      "time it took to snapshot volume",
 			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
-		}, []string{"type"}),
+		}, []string{"type", "class"}),
+		volumeRestoreTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: metricsNamespace,
+			Subsystem: metricsWorkspaceSubsystem,
+			Name:      "volume_restore_seconds",
+			Help:      "time it took to restore volume",
+			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
+		}, []string{"type", "class"}),
 		totalStartsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsWorkspaceSubsystem,
@@ -178,6 +186,7 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
 		m.initializeTimeHistVec,
 		m.finalizeTimeHistVec,
 		m.volumeSnapshotTimeHistVec,
+		m.volumeRestoreTimeHistVec,
 		newPhaseTotalVec(m.manager),
 		newWorkspaceActivityVec(m.manager),
 		newTimeoutSettingsVec(m.manager),
diff --git a/components/ws-manager/pkg/manager/monitor.go b/components/ws-manager/pkg/manager/monitor.go
index 85eea12454e054..163e447a00530f 100644
--- a/components/ws-manager/pkg/manager/monitor.go
+++ b/components/ws-manager/pkg/manager/monitor.go
@@ -1036,7 +1036,7 @@ func (m *Monitor) finalizeWorkspaceContent(ctx context.Context, wso *workspaceOb
 					return true, nil, err
 				}
 				readyVolumeSnapshot = true
-				hist, err := m.manager.metrics.volumeSnapshotTimeHistVec.GetMetricWithLabelValues(wsType)
+				hist, err := m.manager.metrics.volumeSnapshotTimeHistVec.GetMetricWithLabelValues(wsType, wso.Pod.Labels[workspaceClassLabel])
 				if err != nil {
 					log.WithError(err).WithField("type", wsType).Warn("cannot get volume snapshot time histogram metric")
 				} else {