@@ -2,6 +2,7 @@ package backend
2
2
3
3
import (
4
4
"context"
5
+ "errors"
5
6
"fmt"
6
7
"sync"
7
8
"time"
@@ -10,10 +11,6 @@ import (
10
11
klog "k8s.io/klog/v2"
11
12
)
12
13
13
- const (
14
- fetchMetricsTimeout = 5 * time .Second
15
- )
16
-
17
14
func NewProvider (pmc PodMetricsClient , datastore * K8sDatastore ) * Provider {
18
15
p := & Provider {
19
16
podMetrics : sync.Map {},
@@ -35,23 +32,46 @@ type PodMetricsClient interface {
35
32
FetchMetrics (ctx context.Context , pod Pod , existing * PodMetrics ) (* PodMetrics , error )
36
33
}
37
34
35
+ func isPodMetricsStale (pm * PodMetrics ) bool {
36
+ // TODO: make it configurable
37
+ return time .Since (pm .UpdatedTime ) > 5 * time .Second
38
+ }
39
+
38
40
func (p * Provider ) AllPodMetrics () []* PodMetrics {
41
+ return p .allPodMetrics (false )
42
+ }
43
+
44
+ func (p * Provider ) AllPodMetricsIncludingStale () []* PodMetrics {
45
+ return p .allPodMetrics (true )
46
+ }
47
+
48
+ func (p * Provider ) allPodMetrics (staleIncluded bool ) []* PodMetrics {
39
49
res := []* PodMetrics {}
40
50
fn := func (k , v any ) bool {
41
- res = append (res , v .(* PodMetrics ))
51
+ m := v .(* PodMetrics )
52
+
53
+ if ! staleIncluded && isPodMetricsStale (m ) {
54
+ // exclude stale metrics for scheduler
55
+ klog .V (4 ).Infof ("Pod metrics for %s is stale, skipping" , m .Pod )
56
+ return true
57
+ }
58
+
59
+ res = append (res , m )
42
60
return true
43
61
}
44
62
p .podMetrics .Range (fn )
45
63
return res
46
64
}
47
65
48
66
// UpdatePodMetrics stamps pm with the current time and stores it in
// p.podMetrics under the key pod, replacing any previous value for that key.
//
// Note that pm itself is modified: its UpdatedTime field is overwritten
// before the pointer is stored.
func (p *Provider) UpdatePodMetrics(pod Pod, pm *PodMetrics) {
	stampedAt := time.Now()
	pm.UpdatedTime = stampedAt
	p.podMetrics.Store(pod, pm)
}
51
70
52
71
func (p * Provider ) GetPodMetrics (pod Pod ) (* PodMetrics , bool ) {
53
72
val , ok := p .podMetrics .Load (pod )
54
73
if ok {
74
+ // For now, we don't exclude stale metrics with GET operation.
55
75
return val .(* PodMetrics ), true
56
76
}
57
77
return nil , false
@@ -60,11 +80,11 @@ func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) {
60
80
func (p * Provider ) Init (refreshPodsInterval , refreshMetricsInterval time.Duration ) error {
61
81
p .refreshPodsOnce ()
62
82
63
- if err := p .refreshMetricsOnce (); err != nil {
83
+ if err := p .refreshMetricsOnce (refreshMetricsInterval ); err != nil {
64
84
klog .Errorf ("Failed to init metrics: %v" , err )
65
85
}
66
86
67
- klog .Infof ("Initialized pods and metrics: %+v" , p .AllPodMetrics ())
87
+ klog .Infof ("Initialized pods and metrics: %+v" , p .AllPodMetricsIncludingStale ())
68
88
69
89
// periodically refresh pods
70
90
go func () {
@@ -76,10 +96,18 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duratio
76
96
77
97
// periodically refresh metrics
78
98
go func () {
99
+ time .Sleep (refreshMetricsInterval )
79
100
for {
80
- time .Sleep (refreshMetricsInterval )
81
- if err := p .refreshMetricsOnce (); err != nil {
82
- klog .V (4 ).Infof ("Failed to refresh metrics: %v" , err )
101
+ start := time .Now ()
102
+
103
+ if err := p .refreshMetricsOnce (refreshMetricsInterval ); err != nil {
104
+ klog .Errorf ("Failed to refresh metrics: %v" , err )
105
+ }
106
+
107
+ now := time .Now ()
108
+ used := now .Sub (start )
109
+ if used < refreshMetricsInterval {
110
+ time .Sleep (refreshMetricsInterval - used )
83
111
}
84
112
}
85
113
}()
@@ -89,7 +117,7 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duratio
89
117
go func () {
90
118
for {
91
119
time .Sleep (5 * time .Second )
92
- klog .Infof ("===DEBUG: Current Pods and metrics: %+v" , p .AllPodMetrics ())
120
+ klog .Infof ("===DEBUG: Current Pods and metrics: %+v" , p .AllPodMetricsIncludingStale ())
93
121
}
94
122
}()
95
123
}
@@ -127,8 +155,8 @@ func (p *Provider) refreshPodsOnce() {
127
155
p .datastore .pods .Range (addNewPods )
128
156
}
129
157
130
- func (p * Provider ) refreshMetricsOnce () error {
131
- ctx , cancel := context .WithTimeout (context .Background (), fetchMetricsTimeout )
158
+ func (p * Provider ) refreshMetricsOnce (interval time. Duration ) error {
159
+ ctx , cancel := context .WithTimeout (context .Background (), interval )
132
160
defer cancel ()
133
161
start := time .Now ()
134
162
defer func () {
@@ -147,7 +175,12 @@ func (p *Provider) refreshMetricsOnce() error {
147
175
defer wg .Done ()
148
176
updated , err := p .pmc .FetchMetrics (ctx , pod , existing )
149
177
if err != nil {
150
- errCh <- fmt .Errorf ("failed to parse metrics from %s: %v" , pod , err )
178
+ // handle timeout error as less severe error
179
+ if errors .Is (err , context .Canceled ) {
180
+ klog .V (4 ).Infof ("Timeout fetching metrics for pod %s" , pod )
181
+ } else {
182
+ errCh <- fmt .Errorf ("failed to fetch metrics from %s: %v" , pod , err )
183
+ }
151
184
return
152
185
}
153
186
p .UpdatePodMetrics (pod , updated )
0 commit comments