Skip to content

Add a metric of PVC restore duration time #10623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions components/ws-manager/pkg/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,11 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
}
span.LogKV("event", "pod description created")

var createPVC bool
var (
createPVC bool
pvc *corev1.PersistentVolumeClaim
volumeRestoreTime time.Time
)
for _, feature := range startContext.Request.Spec.FeatureFlags {
if feature == api.WorkspaceFeatureFlag_PERSISTENT_VOLUME_CLAIM {
createPVC = true
Expand All @@ -223,14 +227,15 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
}
if createPVC {
clog.Info("PVC feature detected, creating PVC object")
pvc, err := m.createPVCForWorkspacePod(startContext)
pvc, err = m.createPVCForWorkspacePod(startContext)
if err != nil {
return nil, xerrors.Errorf("cannot create pvc for workspace pod: %w", err)
}
err = m.Clientset.Create(ctx, pvc)
if err != nil {
if err != nil && !k8serr.IsAlreadyExists(err) {
return nil, xerrors.Errorf("cannot create pvc object for workspace pod: %w", err)
}
volumeRestoreTime = time.Now()
}

// create the Pod in the cluster and wait until it is scheduled
Expand Down Expand Up @@ -261,6 +266,21 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
return false, err
}

if createPVC {
err = wait.PollWithContext(ctx, 100*time.Millisecond, time.Minute, pvcRunning(m.Clientset, pvc.Name, pvc.Namespace))
if err != nil {
return false, nil
}

wsType := api.WorkspaceType_name[int32(req.Type)]
hist, err := m.metrics.volumeRestoreTimeHistVec.GetMetricWithLabelValues(wsType, req.Spec.Class)
if err != nil {
log.WithError(err).WithField("type", wsType).Warn("cannot get volume restore time histogram metric")
} else {
hist.Observe(time.Since(volumeRestoreTime).Seconds())
}
}

// wait at least 60 seconds before deleting pending pod and trying again due to pending PVC attachment
err = wait.PollWithContext(ctx, 100*time.Millisecond, 60*time.Second, podRunning(m.Clientset, pod.Name, pod.Namespace))
if err != nil {
Expand Down Expand Up @@ -381,6 +401,20 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
}
}

// pvcRunning builds a poll condition that reports whether the
// PersistentVolumeClaim identified by pvcName/namespace is in the Bound phase.
//
// The returned condition never yields an error: a failed lookup is reported
// as "not done yet", so a polling caller keeps retrying until its own timeout
// or context cancellation ends the wait.
func pvcRunning(clientset client.Client, pvcName, namespace string) wait.ConditionWithContextFunc {
	key := types.NamespacedName{Namespace: namespace, Name: pvcName}

	return func(ctx context.Context) (bool, error) {
		var claim corev1.PersistentVolumeClaim
		if err := clientset.Get(ctx, key, &claim); err != nil {
			// The claim could not be fetched on this attempt; treat it as
			// not bound yet instead of aborting the poll.
			return false, nil
		}

		// Only a bound claim counts as done; every other phase keeps polling.
		return claim.Status.Phase == corev1.ClaimBound, nil
	}
}

// validateStartWorkspaceRequest ensures that acting on this request will not leave the system in an invalid state
func validateStartWorkspaceRequest(req *api.StartWorkspaceRequest) error {
err := validation.ValidateStruct(req.Spec,
Expand Down
11 changes: 10 additions & 1 deletion components/ws-manager/pkg/manager/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type metrics struct {
initializeTimeHistVec *prometheus.HistogramVec
finalizeTimeHistVec *prometheus.HistogramVec
volumeSnapshotTimeHistVec *prometheus.HistogramVec
volumeRestoreTimeHistVec *prometheus.HistogramVec

// Counter
totalStartsCounterVec *prometheus.CounterVec
Expand Down Expand Up @@ -90,7 +91,14 @@ func newMetrics(m *Manager) *metrics {
Name: "volume_snapshot_seconds",
Help: "time it took to snapshot volume",
Buckets: prometheus.ExponentialBuckets(2, 2, 10),
}, []string{"type"}),
}, []string{"type", "class"}),
volumeRestoreTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: "volume_restore_seconds",
Help: "time it took to restore volume",
Buckets: prometheus.ExponentialBuckets(2, 2, 10),
}, []string{"type", "class"}),
totalStartsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Expand Down Expand Up @@ -178,6 +186,7 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
m.initializeTimeHistVec,
m.finalizeTimeHistVec,
m.volumeSnapshotTimeHistVec,
m.volumeRestoreTimeHistVec,
newPhaseTotalVec(m.manager),
newWorkspaceActivityVec(m.manager),
newTimeoutSettingsVec(m.manager),
Expand Down
2 changes: 1 addition & 1 deletion components/ws-manager/pkg/manager/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,7 @@ func (m *Monitor) finalizeWorkspaceContent(ctx context.Context, wso *workspaceOb
return true, nil, err
}
readyVolumeSnapshot = true
hist, err := m.manager.metrics.volumeSnapshotTimeHistVec.GetMetricWithLabelValues(wsType)
hist, err := m.manager.metrics.volumeSnapshotTimeHistVec.GetMetricWithLabelValues(wsType, wso.Pod.Labels[workspaceClassLabel])
if err != nil {
log.WithError(err).WithField("type", wsType).Warn("cannot get volume snapshot time histogram metric")
} else {
Expand Down