Skip to content

Commit f283133

Browse files
committed
Fix: Add sleep to TestMetricsRefresh for flakes.
The TestMetricsRefresh test in pod_metrics_test.go was flaky due to a race condition. The `StopRefreshLoop` method would signal the metrics refresh goroutine to stop but did not wait for its actual termination. If the test updated the mock metrics client immediately after calling `StopRefreshLoop`, the refresh goroutine could, in rare cases, perform a final metrics fetch with the new data before fully exiting. This resulted in the test asserting against unexpected metric values. This commit resolves the issue by making adding a sleep for the metrics refresh interval in TestMetricsRefresh. Additionally, it adds the following for robustness in `StopRefreshLoop`. - `stopOnce` is used to ensure the `done` channel is only closed once (for idempotency and protection against concurrent calls). This change ensures that the refresh goroutine is guaranteed to have stopped before any test assertions are made, eliminating the race condition.
1 parent 2b2b4a6 commit f283133

File tree

3 files changed

+15
-10
lines changed

3 files changed

+15
-10
lines changed

pkg/epp/backend/metrics/pod_metrics.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@ type podMetrics struct {
4242
ds Datastore
4343
interval time.Duration
4444

45-
once sync.Once // ensure the StartRefreshLoop is only called once.
46-
done chan struct{}
45+
startOnce sync.Once // ensures the refresh loop goroutine is started only once
46+
stopOnce sync.Once // ensures the done channel is closed only once
47+
done chan struct{}
4748

4849
logger logr.Logger
4950
}
@@ -86,7 +87,7 @@ func toInternalPod(pod *corev1.Pod) *backend.Pod {
8687
// start starts a goroutine exactly once to periodically update metrics. The goroutine will be
8788
// stopped either when stop() is called, or the given ctx is cancelled.
8889
func (pm *podMetrics) startRefreshLoop(ctx context.Context) {
89-
pm.once.Do(func() {
90+
pm.startOnce.Do(func() {
9091
go func() {
9192
pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod())
9293
ticker := time.NewTicker(pm.interval)
@@ -138,5 +139,7 @@ func (pm *podMetrics) refreshMetrics() error {
138139

139140
func (pm *podMetrics) StopRefreshLoop() {
140141
pm.logger.V(logutil.DEFAULT).Info("Stopping refresher", "pod", pm.GetPod())
141-
close(pm.done)
142+
pm.stopOnce.Do(func() {
143+
close(pm.done)
144+
})
142145
}

pkg/epp/backend/metrics/pod_metrics_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func TestMetricsRefresh(t *testing.T) {
7878
// Stop the loop, and simulate metric update again, this time the PodMetrics won't get the
7979
// new update.
8080
pm.StopRefreshLoop()
81+
time.Sleep(fetchMetricsTimeout)
8182
pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: updated})
8283
// Still expect the same condition (no metrics update).
8384
assert.EventuallyWithT(t, condition, time.Second, time.Millisecond)

pkg/epp/backend/metrics/types.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,13 @@ type PodMetricsFactory struct {
4242
func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics {
4343
pod := toInternalPod(in)
4444
pm := &podMetrics{
45-
pmc: f.pmc,
46-
ds: ds,
47-
interval: f.refreshMetricsInterval,
48-
once: sync.Once{},
49-
done: make(chan struct{}),
50-
logger: log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
45+
pmc: f.pmc,
46+
ds: ds,
47+
interval: f.refreshMetricsInterval,
48+
startOnce: sync.Once{},
49+
stopOnce: sync.Once{},
50+
done: make(chan struct{}),
51+
logger: log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
5152
}
5253
pm.pod.Store(pod)
5354
pm.metrics.Store(newMetricsState())

0 commit comments

Comments
 (0)