@@ -20,6 +20,7 @@ import (
20
20
"context"
21
21
"errors"
22
22
"fmt"
23
+ "reflect"
23
24
"sync"
24
25
25
26
corev1 "k8s.io/api/core/v1"
44
45
// Datastore is a local cache of relevant data for the given InferencePool (currently all pulled from the k8s API).
45
46
type Datastore interface {
46
47
// InferencePool operations
47
- PoolSet (pool * v1alpha2.InferencePool )
48
+ PoolSet (ctx context. Context , client client. Client , pool * v1alpha2.InferencePool ) error
48
49
PoolGet () (* v1alpha2.InferencePool , error )
49
50
PoolHasSynced () bool
50
51
PoolLabelsMatch (podLabels map [string ]string ) bool
@@ -63,7 +64,6 @@ type Datastore interface {
63
64
PodList (func (backendmetrics.PodMetrics ) bool ) []backendmetrics.PodMetrics
64
65
PodUpdateOrAddIfNotExist (pod * corev1.Pod , pool * v1alpha2.InferencePool ) bool
65
66
PodDelete (namespacedName types.NamespacedName )
66
- PodResyncAll (ctx context.Context , ctrlClient client.Client , pool * v1alpha2.InferencePool )
67
67
68
68
// Clears the store state, happens when the pool gets deleted.
69
69
Clear ()
@@ -102,10 +102,31 @@ func (ds *datastore) Clear() {
102
102
}
103
103
104
104
// /// InferencePool APIs ///
105
- func (ds * datastore ) PoolSet (pool * v1alpha2.InferencePool ) {
105
+ func (ds * datastore ) PoolSet (ctx context.Context , client client.Client , pool * v1alpha2.InferencePool ) error {
106
+ if pool == nil {
107
+ ds .Clear ()
108
+ return nil
109
+ }
110
+ logger := log .FromContext (ctx )
106
111
ds .poolAndModelsMu .Lock ()
107
112
defer ds .poolAndModelsMu .Unlock ()
113
+
114
+ oldPool := ds .pool
108
115
ds .pool = pool
116
+ if oldPool == nil || ! reflect .DeepEqual (pool .Spec .Selector , oldPool .Spec .Selector ) {
117
+ logger .V (logutil .DEFAULT ).Info ("Updating inference pool endpoints" , "selector" , pool .Spec .Selector )
118
+ // A full resync is required to address two cases:
119
+ // 1) At startup, the pod events may get processed before the pool is synced with the datastore,
120
+ // and hence they will not be added to the store since pool selector is not known yet
121
+ // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need
122
+ // to resync the whole pool: remove pods in the store that don't match the new selector and add
123
+ // the ones that may have existed already to the store.
124
+ if err := ds .podResyncAll (ctx , client ); err != nil {
125
+ return fmt .Errorf ("failed to update pods to match the updated pool selector" )
126
+ }
127
+ }
128
+
129
+ return nil
109
130
}
110
131
111
132
func (ds * datastore ) PoolGet () (* v1alpha2.InferencePool , error ) {
@@ -247,27 +268,35 @@ func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.In
247
268
return ok
248
269
}
249
270
250
- func (ds * datastore ) PodResyncAll (ctx context.Context , ctrlClient client.Client , pool * v1alpha2.InferencePool ) {
271
+ func (ds * datastore ) PodDelete (namespacedName types.NamespacedName ) {
272
+ v , ok := ds .pods .LoadAndDelete (namespacedName )
273
+ if ok {
274
+ pmr := v .(backendmetrics.PodMetrics )
275
+ pmr .StopRefreshLoop ()
276
+ }
277
+ }
278
+
279
+ func (ds * datastore ) podResyncAll (ctx context.Context , ctrlClient client.Client ) error {
251
280
logger := log .FromContext (ctx )
252
281
podList := & corev1.PodList {}
253
282
if err := ctrlClient .List (ctx , podList , & client.ListOptions {
254
- LabelSelector : selectorFromInferencePoolSelector (pool .Spec .Selector ),
255
- Namespace : pool .Namespace ,
283
+ LabelSelector : selectorFromInferencePoolSelector (ds . pool .Spec .Selector ),
284
+ Namespace : ds . pool .Namespace ,
256
285
}); err != nil {
257
- log .FromContext (ctx ).V (logutil .DEFAULT ).Error (err , "Failed to list clients" )
258
- return
286
+ return fmt .Errorf ("failed to list pods - %w" , err )
259
287
}
260
288
261
289
activePods := make (map [string ]bool )
262
290
for _ , pod := range podList .Items {
263
- if podutil .IsPodReady (& pod ) {
264
- namespacedName := types.NamespacedName {Name : pod .Name , Namespace : pod .Namespace }
265
- activePods [pod .Name ] = true
266
- if ds .PodUpdateOrAddIfNotExist (& pod , pool ) {
267
- logger .V (logutil .DEFAULT ).Info ("Pod added" , "name" , namespacedName )
268
- } else {
269
- logger .V (logutil .DEFAULT ).Info ("Pod already exists" , "name" , namespacedName )
270
- }
291
+ if ! podutil .IsPodReady (& pod ) {
292
+ continue
293
+ }
294
+ namespacedName := types.NamespacedName {Name : pod .Name , Namespace : pod .Namespace }
295
+ activePods [pod .Name ] = true
296
+ if ds .PodUpdateOrAddIfNotExist (& pod , ds .pool ) {
297
+ logger .V (logutil .DEFAULT ).Info ("Pod added" , "name" , namespacedName )
298
+ } else {
299
+ logger .V (logutil .DEFAULT ).Info ("Pod already exists" , "name" , namespacedName )
271
300
}
272
301
}
273
302
@@ -281,14 +310,8 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client,
281
310
return true
282
311
}
283
312
ds .pods .Range (deleteFn )
284
- }
285
313
286
- func (ds * datastore ) PodDelete (namespacedName types.NamespacedName ) {
287
- v , ok := ds .pods .LoadAndDelete (namespacedName )
288
- if ok {
289
- pmr := v .(backendmetrics.PodMetrics )
290
- pmr .StopRefreshLoop ()
291
- }
314
+ return nil
292
315
}
293
316
294
317
func selectorFromInferencePoolSelector (selector map [v1alpha2.LabelKey ]v1alpha2.LabelValue ) labels.Selector {
0 commit comments