
Commit dc5da24

Merge pull request kubernetes-sigs#1995 from k8s-infra-cherrypick-robot/cherry-pick-1990-to-release-1.17
[release-1.17] Add exponential backoff retries for getting Node from API server logic
2 parents c435a87 + eda283f commit dc5da24
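
The retry logic added in pkg/gce-pd-csi-driver/cache.go below uses wait.Backoff{Duration: 1 * time.Second, Factor: 2.0, Steps: 5}: up to five Get attempts with sleeps of 1s, 2s, 4s, and 8s between them, roughly 15 seconds worst case before the error is surfaced and the driver exits. A minimal standalone sketch of that schedule (assuming the apimachinery wait package vendored by this repo and zero jitter; not part of this commit):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Same parameters as getNodeWithRetry in the cache.go diff below.
	backoff := wait.Backoff{Duration: 1 * time.Second, Factor: 2.0, Steps: 5}

	// Step() returns the next delay and advances the backoff state; with
	// five steps there are four sleeps separating the five attempts.
	total := time.Duration(0)
	for backoff.Steps > 1 {
		d := backoff.Step()
		total += d
		fmt.Println("sleep between attempts:", d)
	}
	fmt.Println("worst-case wait before failing:", total) // 1s + 2s + 4s + 8s = 15s
}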

3 files changed: +41 -13
cmd/gce-pd-csi-driver/main.go (+12 -7)
@@ -245,11 +245,15 @@ func handle() {
 	if err != nil {
 		klog.Fatalf("Failed to set up metadata service: %v", err.Error())
 	}
+	isDataCacheEnabledNodePool, err := isDataCacheEnabledNodePool(ctx, *nodeName)
+	if err != nil {
+		klog.Fatalf("Failed to get node info from API server: %v", err.Error())
+	}
 	nsArgs := driver.NodeServerArgs{
 		EnableDeviceInUseCheck: *enableDeviceInUseCheck,
 		DeviceInUseTimeout:     *deviceInUseTimeout,
 		EnableDataCache:        *enableDataCacheFlag,
-		DataCacheEnabledNodePool: isDataCacheEnabledNodePool(ctx, *nodeName),
+		DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
 	}
 	nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)
 	if *maxConcurrentFormatAndMount > 0 {
@@ -347,14 +351,15 @@ func urlFlag(target **url.URL, name string, usage string) {
 	})
 }

-func isDataCacheEnabledNodePool(ctx context.Context, nodeName string) bool {
-	if nodeName != common.TestNode { // disregard logic below when E2E testing.
+func isDataCacheEnabledNodePool(ctx context.Context, nodeName string) (bool, error) {
+	if !*enableDataCacheFlag {
+		return false, nil
+	}
+	if len(nodeName) > 0 && nodeName != common.TestNode { // disregard logic below when E2E testing.
 		dataCacheLSSDCount, err := driver.GetDataCacheCountFromNodeLabel(ctx, nodeName)
-		if err != nil || dataCacheLSSDCount == 0 {
-			return false
-		}
+		return dataCacheLSSDCount != 0, err
 	}
-	return true
+	return true, nil
 }

 func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
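
Worth noting at the call site: isDataCacheEnabledNodePool, err := isDataCacheEnabledNodePool(ctx, *nodeName) is legal Go, because the right-hand side resolves to the package-level function before the newly declared variable enters scope, but from that statement on the identifier shadows the function inside handle(). Errors that survive the retries remain fatal here: klog.Fatalf crashloops the driver so kubelet restarts it, preserving the old crash-to-retry behavior once the in-process retries are spent.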

pkg/gce-pd-csi-driver/cache.go (+28 -5)
@@ -7,10 +7,13 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"time"

 	csi "github.com/container-storage-interface/spec/lib/go/csi"
 	fsnotify "github.com/fsnotify/fsnotify"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
@@ -242,18 +245,15 @@ func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx con

 func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int, error) {
 	cfg, err := rest.InClusterConfig()
-	// We want to capture API errors with node label fetching, so return -1
-	// in those cases instead of 0.
 	if err != nil {
 		return 0, err
 	}
 	kubeClient, err := kubernetes.NewForConfig(cfg)
 	if err != nil {
 		return 0, err
 	}
-	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	node, err := getNodeWithRetry(ctx, kubeClient, nodeName)
 	if err != nil {
-		// We could retry, but this error will also crashloop the driver which may be as good a way to retry as any.
 		return 0, err
 	}
 	if val, found := node.GetLabels()[fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel)]; found {
@@ -264,10 +264,33 @@
 		klog.V(4).Infof("Number of local SSDs requested for Data Cache: %v", dataCacheCount)
 		return dataCacheCount, nil
 	}
-	// This will be returned for a non-Data-Cache node pool
 	return 0, nil
 }

+func getNodeWithRetry(ctx context.Context, kubeClient *kubernetes.Clientset, nodeName string) (*v1.Node, error) {
+	var nodeObj *v1.Node
+	backoff := wait.Backoff{
+		Duration: 1 * time.Second,
+		Factor:   2.0,
+		Steps:    5,
+	}
+	err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (bool, error) {
+		node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+		if err != nil {
+			klog.Warningf("Error getting node %s: %v, retrying...\n", nodeName, err)
+			return false, nil
+		}
+		nodeObj = node
+		klog.V(4).Infof("Successfully retrieved node info %s\n", nodeName)
+		return true, nil
+	})
+
+	if err != nil {
+		klog.Errorf("Failed to get node %s after retries: %v\n", nodeName, err)
+	}
+	return nodeObj, err
+}
+
 func FetchRaidedLssdCountForDatacache() (int, error) {
 	raidedPath, err := fetchRAIDedLocalSsdPath()
 	if err != nil {
test/e2e/utils/utils.go (+1 -1)
@@ -73,9 +73,9 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo, driverConfig DriverC
 		fmt.Sprintf("--fallback-requisite-zones=%s", strings.Join(driverConfig.Zones, ",")),
 	}

+	extra_flags = append(extra_flags, fmt.Sprintf("--node-name=%s", utilcommon.TestNode))
 	if instance.GetLocalSSD() > 0 {
 		extra_flags = append(extra_flags, "--enable-data-cache")
-		extra_flags = append(extra_flags, fmt.Sprintf("--node-name=%s", utilcommon.TestNode))
 	}
 	extra_flags = append(extra_flags, fmt.Sprintf("--compute-endpoint=%s", driverConfig.ComputeEndpoint))
 	extra_flags = append(extra_flags, driverConfig.ExtraFlags...)
