@@ -20,6 +20,7 @@ import (
20
20
"context"
21
21
"errors"
22
22
"fmt"
23
+ "reflect"
23
24
"sync"
24
25
25
26
corev1 "k8s.io/api/core/v1"
44
45
// The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api)
45
46
type Datastore interface {
46
47
// InferencePool operations
47
- PoolSet (pool * v1alpha2.InferencePool )
48
+ // PoolSet sets the given pool in the datastore. If the given pool has a different label selector than the previous pool
49
+ // that was stored, the function triggers a resync of the pods to keep the datastore updated. If the given pool
50
+ // is nil, this call triggers the datastore.Clear() function.
51
+ PoolSet (ctx context.Context , client client.Client , pool * v1alpha2.InferencePool ) error
48
52
PoolGet () (* v1alpha2.InferencePool , error )
49
53
PoolHasSynced () bool
50
54
PoolLabelsMatch (podLabels map [string ]string ) bool
@@ -60,10 +64,9 @@ type Datastore interface {
60
64
// PodGetAll returns all pods and metrics, including fresh and stale.
61
65
PodGetAll () []backendmetrics.PodMetrics
62
66
// PodList lists pods matching the given predicate.
63
- PodList (func (backendmetrics.PodMetrics ) bool ) []backendmetrics.PodMetrics
64
- PodUpdateOrAddIfNotExist (pod * corev1.Pod , pool * v1alpha2. InferencePool ) bool
67
+ PodList (predicate func (backendmetrics.PodMetrics ) bool ) []backendmetrics.PodMetrics
68
+ PodUpdateOrAddIfNotExist (pod * corev1.Pod ) bool
65
69
PodDelete (namespacedName types.NamespacedName )
66
- PodResyncAll (ctx context.Context , ctrlClient client.Client , pool * v1alpha2.InferencePool )
67
70
68
71
// Clear resets the store state; it is called when the pool gets deleted.
69
72
Clear ()
@@ -102,10 +105,31 @@ func (ds *datastore) Clear() {
102
105
}
103
106
104
107
// /// InferencePool APIs ///
105
- func (ds * datastore ) PoolSet (pool * v1alpha2.InferencePool ) {
108
+ func (ds * datastore ) PoolSet (ctx context.Context , client client.Client , pool * v1alpha2.InferencePool ) error {
109
+ if pool == nil {
110
+ ds .Clear ()
111
+ return nil
112
+ }
113
+ logger := log .FromContext (ctx )
106
114
ds .poolAndModelsMu .Lock ()
107
115
defer ds .poolAndModelsMu .Unlock ()
116
+
117
+ oldPool := ds .pool
108
118
ds .pool = pool
119
+ if oldPool == nil || ! reflect .DeepEqual (pool .Spec .Selector , oldPool .Spec .Selector ) {
120
+ logger .V (logutil .DEFAULT ).Info ("Updating inference pool endpoints" , "selector" , pool .Spec .Selector )
121
+ // A full resync is required to address two cases:
122
+ // 1) At startup, the pod events may get processed before the pool is synced with the datastore,
123
+ // and hence they will not be added to the store since pool selector is not known yet
124
+ // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need
125
+ // to resync the whole pool: remove pods in the store that don't match the new selector and add
126
+ // the ones that may have existed already to the store.
127
+ if err := ds .podResyncAll (ctx , client ); err != nil {
128
+ return fmt .Errorf ("failed to update pods according to the pool selector - %w" , err )
129
+ }
130
+ }
131
+
132
+ return nil
109
133
}
110
134
111
135
func (ds * datastore ) PoolGet () (* v1alpha2.InferencePool , error ) {
@@ -229,7 +253,7 @@ func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []b
229
253
return res
230
254
}
231
255
232
- func (ds * datastore ) PodUpdateOrAddIfNotExist (pod * corev1.Pod , pool * v1alpha2. InferencePool ) bool {
256
+ func (ds * datastore ) PodUpdateOrAddIfNotExist (pod * corev1.Pod ) bool {
233
257
namespacedName := types.NamespacedName {
234
258
Name : pod .Name ,
235
259
Namespace : pod .Namespace ,
@@ -247,27 +271,35 @@ func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.In
247
271
return ok
248
272
}
249
273
250
- func (ds * datastore ) PodResyncAll (ctx context.Context , ctrlClient client.Client , pool * v1alpha2.InferencePool ) {
274
+ func (ds * datastore ) PodDelete (namespacedName types.NamespacedName ) {
275
+ v , ok := ds .pods .LoadAndDelete (namespacedName )
276
+ if ok {
277
+ pmr := v .(backendmetrics.PodMetrics )
278
+ pmr .StopRefreshLoop ()
279
+ }
280
+ }
281
+
282
+ func (ds * datastore ) podResyncAll (ctx context.Context , ctrlClient client.Client ) error {
251
283
logger := log .FromContext (ctx )
252
284
podList := & corev1.PodList {}
253
285
if err := ctrlClient .List (ctx , podList , & client.ListOptions {
254
- LabelSelector : selectorFromInferencePoolSelector (pool .Spec .Selector ),
255
- Namespace : pool .Namespace ,
286
+ LabelSelector : selectorFromInferencePoolSelector (ds . pool .Spec .Selector ),
287
+ Namespace : ds . pool .Namespace ,
256
288
}); err != nil {
257
- log .FromContext (ctx ).V (logutil .DEFAULT ).Error (err , "Failed to list clients" )
258
- return
289
+ return fmt .Errorf ("failed to list pods - %w" , err )
259
290
}
260
291
261
292
activePods := make (map [string ]bool )
262
293
for _ , pod := range podList .Items {
263
- if podutil .IsPodReady (& pod ) {
264
- namespacedName := types.NamespacedName {Name : pod .Name , Namespace : pod .Namespace }
265
- activePods [pod .Name ] = true
266
- if ds .PodUpdateOrAddIfNotExist (& pod , pool ) {
267
- logger .V (logutil .DEFAULT ).Info ("Pod added" , "name" , namespacedName )
268
- } else {
269
- logger .V (logutil .DEFAULT ).Info ("Pod already exists" , "name" , namespacedName )
270
- }
294
+ if ! podutil .IsPodReady (& pod ) {
295
+ continue
296
+ }
297
+ namespacedName := types.NamespacedName {Name : pod .Name , Namespace : pod .Namespace }
298
+ activePods [pod .Name ] = true
299
+ if ds .PodUpdateOrAddIfNotExist (& pod ) {
300
+ logger .V (logutil .DEFAULT ).Info ("Pod added" , "name" , namespacedName )
301
+ } else {
302
+ logger .V (logutil .DEFAULT ).Info ("Pod already exists" , "name" , namespacedName )
271
303
}
272
304
}
273
305
@@ -281,14 +313,8 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client,
281
313
return true
282
314
}
283
315
ds .pods .Range (deleteFn )
284
- }
285
316
286
- func (ds * datastore ) PodDelete (namespacedName types.NamespacedName ) {
287
- v , ok := ds .pods .LoadAndDelete (namespacedName )
288
- if ok {
289
- pmr := v .(backendmetrics.PodMetrics )
290
- pmr .StopRefreshLoop ()
291
- }
317
+ return nil
292
318
}
293
319
294
320
func selectorFromInferencePoolSelector (selector map [v1alpha2.LabelKey ]v1alpha2.LabelValue ) labels.Selector {
0 commit comments