Skip to content

Commit 2d3b1e2

Browse files
Merge pull request #128356 from lauralorenz/crashloopbackoff-maintain10minuterecoverythreshold
KEP-4603: Maintain current 10 minute recovery threshold for container backoff regardless of changes to the maximum duration Kubernetes-commit: ab30adcbae57fc498cb876979e232b422468af9a
2 parents c57e0a8 + ab2cdce commit 2d3b1e2

File tree

4 files changed

+146
-13
lines changed

4 files changed

+146
-13
lines changed

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ require (
2626
golang.org/x/time v0.7.0
2727
google.golang.org/protobuf v1.35.1
2828
gopkg.in/evanphx/json-patch.v4 v4.12.0
29-
k8s.io/api v0.0.0-20241108114310-4772861d607e
29+
k8s.io/api v0.0.0-20241108114313-789a813a3da8
3030
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83
3131
k8s.io/klog/v2 v2.130.1
3232
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
150150
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
151151
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
152152
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
153-
k8s.io/api v0.0.0-20241108114310-4772861d607e h1:SJxpg9FSTdq3qp/R+LRlZv+xjY+miO+30ZpC7QkSkoM=
154-
k8s.io/api v0.0.0-20241108114310-4772861d607e/go.mod h1:h7yaPC7+0KxMELdLjLoo6n6m3EWq6AeHEY25PjH4cPI=
153+
k8s.io/api v0.0.0-20241108114313-789a813a3da8 h1:+3HQBAIjBgEx+fcUE7qou+b97GTD0FwsRvdivPD4Fk8=
154+
k8s.io/api v0.0.0-20241108114313-789a813a3da8/go.mod h1:h7yaPC7+0KxMELdLjLoo6n6m3EWq6AeHEY25PjH4cPI=
155155
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83 h1:4KfMPmiiRIpvYJQ8cBYFEFht59EKysW1anuJWzHLHNg=
156156
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83/go.mod h1:HqhdaJUgQqky29T1V0o2yFkt/pZqLFIDyn9Zi/8rxoY=
157157
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=

util/flowcontrol/backoff.go

+18-10
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,12 @@ type backoffEntry struct {
3232

3333
type Backoff struct {
3434
sync.RWMutex
35-
Clock clock.Clock
35+
Clock clock.Clock
36+
// HasExpiredFunc controls the logic that determines whether the backoff
37+
// counter should be reset, and when to GC old backoff entries. If nil, the
38+
// default hasExpired function will restart the backoff factor to the
39+
// beginning after observing time has passed at least equal to 2*maxDuration
40+
HasExpiredFunc func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool
3641
defaultDuration time.Duration
3742
maxDuration time.Duration
3843
perItemBackoff map[string]*backoffEntry
@@ -93,7 +98,7 @@ func (p *Backoff) Next(id string, eventTime time.Time) {
9398
p.Lock()
9499
defer p.Unlock()
95100
entry, ok := p.perItemBackoff[id]
96-
if !ok || hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
101+
if !ok || p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
97102
entry = p.initEntryUnsafe(id)
98103
entry.backoff += p.jitter(entry.backoff)
99104
} else {
@@ -119,7 +124,7 @@ func (p *Backoff) IsInBackOffSince(id string, eventTime time.Time) bool {
119124
if !ok {
120125
return false
121126
}
122-
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
127+
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
123128
return false
124129
}
125130
return p.Clock.Since(eventTime) < entry.backoff
@@ -133,21 +138,21 @@ func (p *Backoff) IsInBackOffSinceUpdate(id string, eventTime time.Time) bool {
133138
if !ok {
134139
return false
135140
}
136-
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
141+
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
137142
return false
138143
}
139144
return eventTime.Sub(entry.lastUpdate) < entry.backoff
140145
}
141146

142-
// Garbage collect records that have aged past maxDuration. Backoff users are expected
143-
// to invoke this periodically.
147+
// Garbage collect records that have aged past their expiration, which defaults
148+
// to 2*maxDuration (see hasExpired godoc). Backoff users are expected to invoke
149+
// this periodically.
144150
func (p *Backoff) GC() {
145151
p.Lock()
146152
defer p.Unlock()
147153
now := p.Clock.Now()
148154
for id, entry := range p.perItemBackoff {
149-
if now.Sub(entry.lastUpdate) > p.maxDuration*2 {
150-
// GC when entry has not been updated for 2*maxDuration
155+
if p.hasExpired(now, entry.lastUpdate, p.maxDuration) {
151156
delete(p.perItemBackoff, id)
152157
}
153158
}
@@ -174,7 +179,10 @@ func (p *Backoff) jitter(delay time.Duration) time.Duration {
174179
return time.Duration(p.rand.Float64() * p.maxJitterFactor * float64(delay))
175180
}
176181

177-
// After 2*maxDuration we restart the backoff factor to the beginning
178-
func hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
182+
// Unless an alternate function is provided, after 2*maxDuration we restart the backoff factor to the beginning
183+
func (p *Backoff) hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
184+
if p.HasExpiredFunc != nil {
185+
return p.HasExpiredFunc(eventTime, lastUpdate, maxDuration)
186+
}
179187
return eventTime.Sub(lastUpdate) > maxDuration*2 // consider stable if it's ok for twice the maxDuration
180188
}

util/flowcontrol/backoff_test.go

+125
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,67 @@ func TestBackoffGC(t *testing.T) {
125125
}
126126
}
127127

128+
func TestAlternateBackoffGC(t *testing.T) {
129+
cases := []struct {
130+
name string
131+
hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
132+
maxDuration time.Duration
133+
nonExpiredTime time.Duration
134+
expiredTime time.Duration
135+
}{
136+
{
137+
name: "default GC",
138+
maxDuration: time.Duration(50 * time.Second),
139+
nonExpiredTime: time.Duration(5 * time.Second),
140+
expiredTime: time.Duration(101 * time.Second),
141+
},
142+
{
143+
name: "GC later than 2*maxDuration",
144+
hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
145+
return eventTime.Sub(lastUpdate) >= 200*time.Second
146+
},
147+
maxDuration: time.Duration(50 * time.Second),
148+
nonExpiredTime: time.Duration(101 * time.Second),
149+
expiredTime: time.Duration(501 * time.Second),
150+
},
151+
}
152+
153+
for _, tt := range cases {
154+
clock := testingclock.NewFakeClock(time.Now())
155+
base := time.Second
156+
maxDuration := tt.maxDuration
157+
id := tt.name
158+
159+
b := NewFakeBackOff(base, maxDuration, clock)
160+
if tt.hasExpiredFunc != nil {
161+
b.HasExpiredFunc = tt.hasExpiredFunc
162+
}
163+
164+
// initialize backoff
165+
lastUpdate := clock.Now()
166+
b.Next(id, lastUpdate)
167+
168+
// increment to a time within GC expiration
169+
clock.Step(tt.nonExpiredTime)
170+
b.GC()
171+
172+
// confirm we did not GC this entry
173+
_, found := b.perItemBackoff[id]
174+
if !found {
175+
t.Errorf("[%s] expected GC to skip entry, elapsed time=%s", tt.name, clock.Since(lastUpdate))
176+
}
177+
178+
// increment to a time beyond GC expiration
179+
clock.Step(tt.expiredTime)
180+
b.GC()
181+
r, found := b.perItemBackoff[id]
182+
if found {
183+
t.Errorf("[%s] expected GC of entry after %s got entry %v", tt.name, clock.Since(lastUpdate), r)
184+
}
185+
186+
}
187+
}
188+
128189
func TestIsInBackOffSinceUpdate(t *testing.T) {
129190
id := "_idIsInBackOffSinceUpdate"
130191
tc := testingclock.NewFakeClock(time.Now())
@@ -250,3 +311,67 @@ func TestBackoffWithJitter(t *testing.T) {
250311

251312
t.Logf("exponentially backed off jittered delays: %v", delays)
252313
}
314+
315+
func TestAlternateHasExpiredFunc(t *testing.T) {
316+
cases := []struct {
317+
name string
318+
hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
319+
maxDuration time.Duration
320+
nonExpiredTime time.Duration
321+
expiredTime time.Duration
322+
}{
323+
{
324+
name: "default expiration",
325+
maxDuration: time.Duration(50 * time.Second),
326+
nonExpiredTime: time.Duration(5 * time.Second),
327+
expiredTime: time.Duration(101 * time.Second),
328+
},
329+
{
330+
name: "expires faster than maxDuration",
331+
hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
332+
return eventTime.Sub(lastUpdate) >= 8*time.Second
333+
},
334+
maxDuration: time.Duration(50 * time.Second),
335+
nonExpiredTime: time.Duration(5 * time.Second),
336+
expiredTime: time.Duration(9 * time.Second),
337+
},
338+
}
339+
340+
for _, tt := range cases {
341+
clock := testingclock.NewFakeClock(time.Now())
342+
base := time.Second
343+
maxDuration := tt.maxDuration
344+
id := tt.name
345+
346+
b := NewFakeBackOff(base, maxDuration, clock)
347+
348+
if tt.hasExpiredFunc != nil {
349+
b.HasExpiredFunc = tt.hasExpiredFunc
350+
}
351+
// initialize backoff
352+
b.Next(id, clock.Now())
353+
354+
// increment to a time within expiration
355+
clock.Step(tt.nonExpiredTime)
356+
b.Next(id, clock.Now())
357+
358+
// confirm we did a backoff
359+
w := b.Get(id)
360+
if w < base*2 {
361+
t.Errorf("case %v: backoff object has not incremented like expected: want %s, got %s", tt.name, base*2, w)
362+
}
363+
364+
// increment to a time beyond expiration
365+
clock.Step(tt.expiredTime)
366+
b.Next(id, clock.Now())
367+
368+
// confirm we have reset the backoff to base
369+
w = b.Get(id)
370+
if w != base {
371+
t.Errorf("case %v: hasexpired value: expected %s (backoff to be reset to initial), got %s", tt.name, base, w)
372+
}
373+
374+
clock.SetTime(time.Now())
375+
b.Reset(id)
376+
}
377+
}

0 commit comments

Comments
 (0)