@@ -7,10 +7,13 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"time"
 
 	csi "github.com/container-storage-interface/spec/lib/go/csi"
 	fsnotify "github.com/fsnotify/fsnotify"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
@@ -242,18 +245,15 @@ func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx con
 
 func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int, error) {
 	cfg, err := rest.InClusterConfig()
-	// We want to capture API errors with node label fetching, so return -1
-	// in those cases instead of 0.
 	if err != nil {
 		return 0, err
 	}
 	kubeClient, err := kubernetes.NewForConfig(cfg)
 	if err != nil {
 		return 0, err
 	}
-	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	node, err := getNodeWithRetry(ctx, kubeClient, nodeName)
 	if err != nil {
-		// We could retry, but this error will also crashloop the driver which may be as good a way to retry as any.
 		return 0, err
 	}
 	if val, found := node.GetLabels()[fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel)]; found {
@@ -264,10 +264,33 @@ func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int,
 		klog.V(4).Infof("Number of local SSDs requested for Data Cache: %v", dataCacheCount)
 		return dataCacheCount, nil
 	}
-	// This will be returned for a non-Data-Cache node pool
 	return 0, nil
 }
 
+func getNodeWithRetry(ctx context.Context, kubeClient *kubernetes.Clientset, nodeName string) (*v1.Node, error) {
+	var nodeObj *v1.Node
+	backoff := wait.Backoff{
+		Duration: 1 * time.Second,
+		Factor:   2.0,
+		Steps:    5,
+	}
+	err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (bool, error) {
+		node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+		if err != nil {
+			klog.Warningf("Error getting node %s: %v, retrying...\n", nodeName, err)
+			return false, nil
+		}
+		nodeObj = node
+		klog.V(4).Infof("Successfully retrieved node info %s\n", nodeName)
+		return true, nil
+	})
+
+	if err != nil {
+		klog.Errorf("Failed to get node %s after retries: %v\n", nodeName, err)
+	}
+	return nodeObj, err
+}
+
 func FetchRaidedLssdCountForDatacache() (int, error) {
 	raidedPath, err := fetchRAIDedLocalSsdPath()
 	if err != nil {
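
For context on the new helper, here is a minimal, self-contained sketch of the same backoff pattern `getNodeWithRetry` uses. The `flakyGet` stub is hypothetical and stands in for the `kubeClient.CoreV1().Nodes().Get` call so the snippet runs outside a cluster, and it uses `wait.ExponentialBackoff` rather than the context-aware variant to stay independent of the apimachinery version's condition-function signature.

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Hypothetical stand-in for the Kubernetes node Get call: fails twice,
	// then succeeds, to exercise the retry path.
	attempts := 0
	flakyGet := func() (string, error) {
		attempts++
		if attempts < 3 {
			return "", errors.New("transient API error")
		}
		return "example-node-1", nil
	}

	// Same backoff shape as getNodeWithRetry: 1s initial delay, doubling, 5 attempts.
	backoff := wait.Backoff{
		Duration: 1 * time.Second,
		Factor:   2.0,
		Steps:    5,
	}

	var node string
	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		n, err := flakyGet()
		if err != nil {
			// (false, nil) means "not done yet": sleep and try again.
			// Returning a non-nil error here would abort the retries immediately.
			return false, nil
		}
		node = n
		return true, nil
	})
	if err != nil {
		// Returned once Steps is exhausted without the condition ever succeeding.
		fmt.Println("gave up:", err)
		return
	}
	fmt.Printf("got %s after %d attempts\n", node, attempts)
}
```

With `Duration: 1s`, `Factor: 2.0`, `Steps: 5`, the condition runs up to five times with sleeps of roughly 1s, 2s, 4s and 8s in between, so a persistently failing node lookup surfaces as an error after about 15 seconds instead of relying on the driver crash-looping as the retry mechanism.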