Fix: Add sleep to TestMetricsRefresh for flakes.

LukeAVanDrie · LukeAVanDrie · commit f2831336b6f2 · 2025-05-13T21:25:37.000Z
The TestMetricsRefresh test in pod_metrics_test.go was flaky due to a
race condition. The `StopRefreshLoop` method would signal the metrics
refresh goroutine to stop but did not wait for its actual termination.
If the test updated the mock metrics client immediately after calling
`StopRefreshLoop`, the refresh goroutine could, in rare cases, perform
a final metrics fetch with the new data before fully exiting. This
resulted in the test asserting against unexpected metric values.

This commit resolves the issue by adding a sleep for the metrics
refresh interval in TestMetricsRefresh. Additionally, it adds the
following for robustness in `StopRefreshLoop`:
- `stopOnce` is used to ensure the `done` channel is only closed once
  (for idempotency and protection against concurrent calls).

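The added sleep gives the refresh goroutine time to stop before the
mock metrics client is updated and any test assertions are made, which
avoids the race condition in the test. Note that `StopRefreshLoop`
itself still only signals the goroutine and does not wait for it to
exit.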
diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go
@@ -42,8 +42,9 @@ type podMetrics struct {
 	ds       Datastore
 	interval time.Duration
 
-	once sync.Once // ensure the StartRefreshLoop is only called once.
-	done chan struct{}
+	startOnce sync.Once // ensures the refresh loop goroutine is started only once
+	stopOnce  sync.Once // ensures the done channel is closed only once
+	done      chan struct{}
 
 	logger logr.Logger
 }
@@ -86,7 +87,7 @@ func toInternalPod(pod *corev1.Pod) *backend.Pod {
 // start starts a goroutine exactly once to periodically update metrics. The goroutine will be
 // stopped either when stop() is called, or the given ctx is cancelled.
 func (pm *podMetrics) startRefreshLoop(ctx context.Context) {
-	pm.once.Do(func() {
+	pm.startOnce.Do(func() {
 		go func() {
 			pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod())
 			ticker := time.NewTicker(pm.interval)
@@ -138,5 +139,7 @@ func (pm *podMetrics) refreshMetrics() error {
 
 func (pm *podMetrics) StopRefreshLoop() {
 	pm.logger.V(logutil.DEFAULT).Info("Stopping refresher", "pod", pm.GetPod())
-	close(pm.done)
+	pm.stopOnce.Do(func() {
+		close(pm.done)
+	})
 }
diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go
@@ -78,6 +78,7 @@ func TestMetricsRefresh(t *testing.T) {
 	// Stop the loop, and simulate metric update again, this time the PodMetrics won't get the
 	// new update.
 	pm.StopRefreshLoop()
+	time.Sleep(fetchMetricsTimeout)
 	pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: updated})
 	// Still expect the same condition (no metrics update).
 	assert.EventuallyWithT(t, condition, time.Second, time.Millisecond)
diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go
@@ -42,12 +42,13 @@ type PodMetricsFactory struct {
 func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics {
 	pod := toInternalPod(in)
 	pm := &podMetrics{
-		pmc:      f.pmc,
-		ds:       ds,
-		interval: f.refreshMetricsInterval,
-		once:     sync.Once{},
-		done:     make(chan struct{}),
-		logger:   log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
+		pmc:       f.pmc,
+		ds:        ds,
+		interval:  f.refreshMetricsInterval,
+		startOnce: sync.Once{},
+		stopOnce:  sync.Once{},
+		done:      make(chan struct{}),
+		logger:    log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
 	}
 	pm.pod.Store(pod)
 	pm.metrics.Store(newMetricsState())