Commit e33bb0d

update test setup to avoid running Datacache setup on machines not supporting LSSDs
1 parent 1566da5 commit e33bb0d

File tree (6 files changed, +59 -51 lines changed):

cmd/gce-pd-csi-driver/main.go
pkg/common/constants.go
pkg/gce-pd-csi-driver/cache.go
pkg/gce-pd-csi-driver/node.go
test/e2e/utils/utils.go
test/remote/instance.go

cmd/gce-pd-csi-driver/main.go  +15 -15

@@ -75,7 +75,7 @@ var (
 fallbackRequisiteZonesFlag = flag.String("fallback-requisite-zones", "", "Comma separated list of requisite zones that will be used if there are not sufficient zones present in requisite topologies when provisioning a disk")
 enableStoragePoolsFlag = flag.Bool("enable-storage-pools", false, "If set to true, the CSI Driver will allow volumes to be provisioned in Storage Pools")
 enableHdHAFlag = flag.Bool("allow-hdha-provisioning", false, "If set to true, will allow the driver to provision Hyperdisk-balanced High Availability disks")
-enableDataCacheFlag = flag.Bool("enable-data-cache", false, "If set to true, the CSI Driver will allow volumes to be provisioned with data cache configuration")
+enableDataCacheFlag = flag.Bool("enable-data-cache", false, "If set to true, the CSI Driver will allow volumes to be provisioned with Data Cache configuration")
 nodeName = flag.String("node-name", "", "The node this driver is running on")

 multiZoneVolumeHandleDiskTypesFlag = flag.String("multi-zone-volume-handle-disk-types", "", "Comma separated list of allowed disk types that can use the multi-zone volumeHandle. Used only if --multi-zone-volume-handle-enable")

@@ -130,7 +130,7 @@ func handle() {
 if version == "" {
 klog.Fatalf("version must be set at compile time")
 }
-klog.V(2).Infof("Driver vendor version %v", version)
+klog.V(4).Infof("Driver vendor version %v", version)

 // Start tracing as soon as possible
 if *enableOtelTracing {

@@ -258,10 +258,10 @@ func handle() {

 if *enableDataCacheFlag {
 if nodeName == nil || *nodeName == "" {
-klog.Errorf("Data cache enabled, but --node-name not passed")
+klog.Errorf("Data Cache enabled, but --node-name not passed")
 }
 if err := setupDataCache(ctx, *nodeName); err != nil {
-klog.Errorf("DataCache setup failed: %v", err)
+klog.Errorf("Data Cache setup failed: %v", err)
 }
 }

@@ -370,7 +370,7 @@ func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
 return nil, fmt.Errorf("Error listing LSSDs with empty mountpoint: %v", err)
 }

-// We need to ensure the disks to be used for Datacache are both unRAIDed & not containing mountpoints for ephemeral storage already
+// We need to ensure the disks to be used for Data Cache are both unRAIDed & not containing mountpoints for ephemeral storage already
 availableLssds := slices.Filter(nil, unRaidedLssds, func(e string) bool {
 return slices.Contains(LSSDsWithEmptyMountPoint, e)
 })

@@ -388,33 +388,33 @@ func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
 func setupDataCache(ctx context.Context, nodeName string) error {
 isAlreadyRaided, err := driver.IsRaided()
 if err != nil {
-klog.V(2).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
+klog.V(4).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
 } else if isAlreadyRaided {
-klog.V(2).Infof("Local SSDs are already RAIDed. Skipping Datacache setup.")
+klog.V(4).Infof("Local SSDs are already RAIDed. Skipping Data Cache setup.")
 return nil
 }

 lssdCount := common.LocalSSDCountForDataCache
 if nodeName != common.TestNode {
 var err error
 lssdCount, err = driver.GetDataCacheCountFromNodeLabel(ctx, nodeName)
-if lssdCount == 0 {
-klog.Infof("Datacache is not enabled on node %v", nodeName)
-return nil
-}
 if err != nil {
 return err
 }
+if lssdCount == 0 {
+klog.V(4).Infof("Data Cache is not enabled on node %v, so skipping caching setup", nodeName)
+return nil
+}
 }
 lssdNames, err := fetchLssdsForRaiding(lssdCount)
 if err != nil {
-klog.Fatalf("Failed to get sufficient SSDs for Datacache's caching setup: %v", err)
+klog.Fatalf("Failed to get sufficient SSDs for Data Cache's caching setup: %v", err)
 }
-klog.V(2).Infof("Raiding local ssds to setup data cache: %v", lssdNames)
+klog.V(4).Infof("Raiding local ssds to setup Data Cache: %v", lssdNames)
 if err := driver.RaidLocalSsds(lssdNames); err != nil {
-return fmt.Errorf("Failed to Raid local SSDs, unable to setup data caching, got error %v", err)
+return fmt.Errorf("Failed to Raid local SSDs, unable to setup Data Cache, got error %v", err)
 }

-klog.V(2).Infof("Datacache enabled for node %s", nodeName)
+klog.V(4).Infof("LSSD caching is setup for the Data Cache enabled node %s", nodeName)
 return nil
 }
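Note: the setupDataCache change above reorders the node-label lookup so the error from GetDataCacheCountFromNodeLabel is checked before the zero-count early return; otherwise a failed lookup (which also yields 0) would be misread as "Data Cache not enabled". A minimal, self-contained Go sketch of that ordering (not part of this commit; getDataCacheCount is a hypothetical stand-in for driver.GetDataCacheCountFromNodeLabel):

package main

import (
    "context"
    "fmt"
)

// getDataCacheCount is a hypothetical stand-in for
// driver.GetDataCacheCountFromNodeLabel, which reads the
// cloud.google.com/gke-data-cache-disk node label.
func getDataCacheCount(ctx context.Context, nodeName string) (int, error) {
    return 0, nil
}

func lssdCountForNode(ctx context.Context, nodeName string) (int, error) {
    lssdCount, err := getDataCacheCount(ctx, nodeName)
    if err != nil {
        // Lookup failures surface as errors before the zero check.
        return 0, err
    }
    if lssdCount == 0 {
        // Non-Data-Cache node pool: skip caching setup entirely.
        fmt.Printf("Data Cache is not enabled on node %v, so skipping caching setup\n", nodeName)
        return 0, nil
    }
    return lssdCount, nil
}

func main() {
    count, err := lssdCountForNode(context.Background(), "example-node")
    fmt.Println(count, err)
}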

pkg/common/constants.go  +1 -1

@@ -48,7 +48,7 @@ const (
 // Default LSSD count for datacache E2E tests
 LocalSSDCountForDataCache = 2

-// Node label for datacache
+// Node label for Data Cache (only applicable to GKE nodes)
 NodeLabelPrefix = "cloud.google.com/%s"
 DataCacheLssdCountLabel = "gke-data-cache-disk"
 )
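For reference, a tiny standalone sketch (not part of this commit) of how these two constants compose into the node label key that GetDataCacheCountFromNodeLabel in cache.go looks up:

package main

import "fmt"

const (
    NodeLabelPrefix         = "cloud.google.com/%s"
    DataCacheLssdCountLabel = "gke-data-cache-disk"
)

func main() {
    // Same composition as fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel).
    fmt.Println(fmt.Sprintf(NodeLabelPrefix, DataCacheLssdCountLabel)) // cloud.google.com/gke-data-cache-disk
}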

pkg/gce-pd-csi-driver/cache.go  +33 -32

@@ -16,11 +16,10 @@ import (
 )

 const (
-cacheSuffix               = "csi-fast"
-mainLvSuffix              = "csi-main"
-raidedLocalSsdName        = "csi-driver-data-cache"
-raidMode                  = "0"
-initialRaidedLocalSsdPath = "/dev/md0"
+cacheSuffix        = "csi-fast"
+mainLvSuffix       = "csi-main"
+raidedLocalSsdName = "csi-driver-data-cache"
+raidMode           = "0"
 )

 func fetchRAIDedLocalSsdPath() (string, error) {

@@ -30,12 +29,13 @@ func fetchRAIDedLocalSsdPath() (string, error) {
 }
 info, err := common.RunCommand("grep", []string{raidedLocalSsdName}, "mdadm", args...)
 if err != nil || len(info) == 0 {
-return "", fmt.Errorf("Error getting RAIDed device path for Datacache %v, output:%v ===============", err, string(info))
+return "", fmt.Errorf("Error getting RAIDed device path for Data Cache %v, output:%v", err, string(info))
 }
 infoString := strings.TrimSpace(string(info))
 infoSlice := strings.Split(infoString, " ")

-// We want to get the second element in the array, which is the path to the RAIDed device
+// We want to get the second element in the array (sample: ARRAY /dev/md126 metadata=1.2 name=csi-driver-data-cache UUID=*),
+// which is the path to the RAIDed device
 return infoSlice[1], nil
 }

@@ -51,7 +51,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 volumeGroupName := getVolumeGroupName(nodeId)
 mainDevicePath := "/dev/" + volumeGroupName + "/" + getLvName(mainLvSuffix, volumeId)
 mainLvName := getLvName(mainLvSuffix, volumeId)
-klog.V(2).Infof("Volume group available on node %v ", volumeGroupName)
+klog.V(4).Infof("Volume group available on node %v ", volumeGroupName)
 vgExists := checkVgExists(volumeGroupName)
 if vgExists {
 // Clean up Volume Group before adding the PD

@@ -82,9 +82,9 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 infoString = strings.ReplaceAll(infoString, "\"", "")
 infoSlice := strings.Split(strings.TrimSpace(infoString), " ")
 vgNameForPv := strings.TrimSpace(infoSlice[(len(infoSlice) - 1)])
-klog.V(2).Infof("============================== Physical volume is part of Volume group: %v ==============================", vgNameForPv)
+klog.V(4).Infof("Physical volume is part of Volume group: %v", vgNameForPv)
 if vgNameForPv == volumeGroupName {
-klog.V(2).Infof("============================== Physical Volume(PV) already exists in the Volume Group ==============================")
+klog.V(4).Infof("Physical Volume(PV) already exists in the Volume Group")
 } else if vgNameForPv != "VG" && vgNameForPv != "" {

 info, err = common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "vgchange", []string{"-an", vgNameForPv}...)

@@ -157,7 +157,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 cacheLvName := getLvName(cacheSuffix, volumeId)
 if isCached {
 // Validate that cache is setup for required size
-klog.V(2).Infof("Assuming valid data cache size and mode, resizing cache is not supported")
+klog.V(4).Infof("Assuming valid data cache size and mode, resizing cache is not supported")
 } else {
 fastCacheSize := req.GetPublishContext()[common.ContextDataCacheSize]
 chunkSize := "960" // Cannot use default chunk size(64KiB) as it errors on maxChunksAllowed. Unit - KiB

@@ -207,8 +207,8 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 return mainDevicePath, nil
 }

-func ValidateDataCacheConfig(dataCacheMode string, datacacheSize string, ctx context.Context, nodeName string) error {
-if dataCacheMode != "" && datacacheSize != "" {
+func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx context.Context, nodeName string) error {
+if dataCacheMode != "" && dataCacheSize != "" {
 isAlreadyRaided, err := IsRaided()
 if err != nil {
 return fmt.Errorf("Local SSDs are not setup for caching; got error: %v", err)

@@ -218,48 +218,50 @@ func ValidateDataCacheConfig(dataCacheMode string, datacacheSize string, ctx con
 }
 return nil
 }
-klog.Infof("Data cache is not enabled for PVC")
+klog.V(4).Infof("Data Cache is not enabled for PVC (data-cache-size: %v, data-cache-mode: %v). Please set both these parameters in StorageClass to enable caching", dataCacheSize, dataCacheMode)
 return nil
 }

 func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int, error) {
-if nodeName == common.TestNode {
-return common.LocalSSDCountForDataCache, nil
-}
 cfg, err := rest.InClusterConfig()
 // We want to capture API errors with node label fetching, so return -1
 // in those cases instead of 0.
 if err != nil {
-return -1, err
+return 0, err
 }
 kubeClient, err := kubernetes.NewForConfig(cfg)
 if err != nil {
-return -1, err
+return 0, err
 }
 node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
 if err != nil {
 // We could retry, but this error will also crashloop the driver which may be as good a way to retry as any.
-return -1, err
+return 0, err
 }
 if val, found := node.GetLabels()[fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel)]; found {
 dataCacheCount, err := strconv.Atoi(val)
 if err != nil {
-return -1, fmt.Errorf("Error getting Datacache's LSSD count from node label: %v", err)
+return 0, fmt.Errorf("Error getting Data Cache's LSSD count from node label: %v", err)
 }
-klog.Infof("Number of local SSDs requested for Datacache: %v", dataCacheCount)
+klog.V(4).Infof("Number of local SSDs requested for Data Cache: %v", dataCacheCount)
 return dataCacheCount, nil
 }
-return 0, fmt.Errorf("Cannot get Datacache's LSSD count from node label")
+// This will be returned for a non-Data-Cache node pool
+return 0, nil
 }

 func FetchRaidedLssdCountForDatacache() (int, error) {
+raidedPath, err := fetchRAIDedLocalSsdPath()
+if err != nil {
+return 0, err
+}
 args := []string{
 "--detail",
-initialRaidedLocalSsdPath,
+raidedPath,
 }
 info, err := common.RunCommand("grep", []string{"Raid Devices"}, "mdadm", args...)
 if err != nil {
-return 0, fmt.Errorf("Error getting RAIDed devices for Datacache")
+return 0, fmt.Errorf("Error getting RAIDed devices for Data Cache")
 }
 if len(info) != 0 {
 raidedDeviceInfo := strings.Split(strings.TrimSpace(string(info)), ":")

@@ -294,7 +296,7 @@ func FetchRaidedLssds() ([]string, error) {
 }
 }

-klog.V(2).Infof("Raided NVME list %v", raidedLssdList)
+klog.V(4).Infof("Raided NVME list %v", raidedLssdList)

 return raidedLssdList, nil
 }

@@ -309,7 +311,7 @@ func FetchAllLssds() ([]string, error) {
 infoList := strings.Split(strings.TrimSpace(string(info)), "\n")
 re, err := regexp.Compile("nvme_card([0-9]+)?$")
 if err != nil {
-klog.V(2).ErrorS(err, "Errored while compiling to check PD or LSSD")
+klog.V(4).ErrorS(err, "Errored while compiling to check PD or LSSD")
 }
 for _, ssd := range infoList {
 ssd = strings.TrimSpace(ssd)

@@ -322,7 +324,7 @@
 }
 }

-klog.V(2).Infof("NVME list %v", diskList)
+klog.V(4).Infof("NVME list %v", diskList)

 return diskList, nil
 }

@@ -358,6 +360,7 @@ func cleanupCache(volumeId string, nodeId string) error {
 // If volume group doesn't exist then there's nothing to uncache
 return nil
 }
+reduceVolumeGroup(volumeGroupName, true)
 mainLvName := getLvName(mainLvSuffix, volumeId)
 args := []string{
 "-an",

@@ -404,7 +407,7 @@ func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string)
 if err != nil {
 return fmt.Errorf("Volume group creation failed %w: %s", err, info)
 }
-klog.Infof("Volume group creation succeeded for %v", volumeGroupName)
+klog.V(4).Infof("Volume group creation succeeded for %v", volumeGroupName)

 args = []string{}
 info, err = common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "vgscan", args...)

@@ -431,8 +434,6 @@ func reduceVolumeGroup(volumeGroupName string, force bool) {
 func RaidLocalSsds(availableLssds []string) error {
 args := []string{
 "--create",
-initialRaidedLocalSsdPath,
-"--name",
 raidedLocalSsdName,
 "-l" + raidMode,
 // Force RAIDing as sometime it might fail for caution if there is just 1 LSSD present as 1 LSSD need not be RAIDed

@@ -448,7 +449,7 @@
 // Validate if Raided successfully
 isAlreadyRaided, err := IsRaided()
 if err != nil {
-klog.V(2).Infof("Errored while scanning for available raided LocalSSDs err:%v=", err)
+klog.V(4).Infof("Errored while scanning for available raided LocalSSDs err:%v=", err)
 }
 if !isAlreadyRaided {
 return fmt.Errorf("failed raiding, raided device not found on scanning")

pkg/gce-pd-csi-driver/node.go  +1 -1

@@ -347,7 +347,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 }
 configError := ValidateDataCacheConfig(req.GetPublishContext()[common.ContextDataCacheMode], req.GetPublishContext()[common.ContextDataCacheSize], ctx, nodeId)
 if configError != nil {
-return nil, status.Error(codes.Internal, fmt.Sprintf("Error validate configuration for Datacache: %v", err.Error()))
+return nil, status.Error(codes.Internal, fmt.Sprintf("Error validate configuration for Data Cache: %v", err.Error()))
 }
 devicePath, err = setupCaching(devFsPath, req, nodeId)
 if err != nil {

test/e2e/utils/utils.go  +5 -2

@@ -71,8 +71,11 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo, driverConfig DriverC
 "--allow-hdha-provisioning",
 "--device-in-use-timeout=10s", // Set lower than the usual value to expedite tests
 fmt.Sprintf("--fallback-requisite-zones=%s", strings.Join(driverConfig.Zones, ",")),
-"--enable-data-cache",
-fmt.Sprintf("--node-name=%s", utilcommon.TestNode),
+}
+
+if instance.GetLocalSSD() > 0 {
+extra_flags = append(extra_flags, "--enable-data-cache")
+extra_flags = append(extra_flags, fmt.Sprintf("--node-name=%s", utilcommon.TestNode))
 }
 extra_flags = append(extra_flags, fmt.Sprintf("--compute-endpoint=%s", driverConfig.ComputeEndpoint))
 extra_flags = append(extra_flags, driverConfig.ExtraFlags...)

test/remote/instance.go  +4

@@ -80,6 +80,10 @@ func (i *InstanceInfo) GetNodeID() string {
 return common.CreateNodeID(i.cfg.Project, i.cfg.Zone, i.cfg.Name)
 }

+func (i *InstanceInfo) GetLocalSSD() int64 {
+return i.cfg.LocalSSDCount
+}
+
 func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) bool {
 if !strings.Contains(curInst.MachineType, newInst.MachineType) {
 klog.Infof("Machine type mismatch")
