
Commit aad78f9

Implement watcher & reboot stability for data cache on the master branch.
1 parent 39a5910 commit aad78f9

35 files changed: +4668 −18 lines

cmd/gce-pd-csi-driver/main.go  (+40 −3)
@@ -105,8 +105,12 @@ const (
 	driverName = "pd.csi.storage.gke.io"
 	dataCacheLabel = "datacache-storage-gke-io"
 	dataCacheLabelValue = "enabled"
+	raidedLocalSsdName = "csi-driver-data-cache"
+	raidedLssdPrefix = "/dev/md"
 )
 
+var raidedLocalSsdPath = raidedLssdPrefix + raidedLocalSsdName
+
 func init() {
 	// klog verbosity guide for this package
 	// Use V(2) for one time config information
@@ -266,9 +270,10 @@ func handle() {
 		if nodeName == nil || *nodeName == "" {
 			klog.Errorf("Data cache enabled, but --node-name not passed")
 		}
-		if err := setupDataCache(ctx, *nodeName); err != nil {
+		if err := setupDataCache(ctx, *nodeName, nodeServer.MetadataService.GetName()); err != nil {
 			klog.Errorf("DataCache setup failed: %v", err)
 		}
+		go driver.StartWatcher(*nodeName)
 	}
 
 	err = gceDriver.SetupGCEDriver(driverName, version, extraVolumeLabels, extraTags, identityServer, controllerServer, nodeServer)
@@ -350,7 +355,7 @@ func urlFlag(target **url.URL, name string, usage string) {
 	})
 }
 
-func setupDataCache(ctx context.Context, nodeName string) error {
+func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
 	klog.V(2).Infof("Setting up data cache for node %s", nodeName)
 	if nodeName != common.TestNode {
 		cfg, err := rest.InClusterConfig()
@@ -373,7 +378,39 @@ func setupDataCache(ctx context.Context, nodeName string) error {
 	}
 	klog.V(2).Info("Raiding local ssds to setup data cache")
 	if err := driver.RaidLocalSsds(); err != nil {
-		return fmt.Errorf("Failed to Raid local SSDs, unable to setup data caching, got error %v", err)
+		return fmt.Errorf("failed to Raid local SSDs, unable to setup data caching, got error %v", err)
+	}
+
+	// vgcreate with the raided local ssds.
+	info, err := common.RunCommand("grep", raidedLocalSsdName, "ls", raidedLssdPrefix)
+	if err != nil {
+		return fmt.Errorf("failed while listing raided devices, err: %v, output: %v", err, info)
+	}
+	infoString := strings.TrimSpace(string(info))
+	klog.V(2).Infof("Got Raided LSSD name %v", infoString)
+	raidedLocalSsdPath = raidedLssdPrefix + infoString
+	volumeGroupName := driver.GetVolumeGroupName(nodeId)
+
+	klog.V(2).Infof("vgscan before vgcreate")
+	vgExists := driver.CheckVgExists(volumeGroupName)
+	klog.V(2).Infof("vgscan info contains volumeGroupName or not %v", vgExists)
+	// Check if the required volume group already exists
+	if vgExists {
+		klog.V(2).Infof("VG exists, now check if PD is part of VG")
+
+		// Clean up Volume Group before adding the PD
+		driver.ReduceVolumeGroup(volumeGroupName, true)
+
+		// validate that raidedLSSD is part of VG
+		err = driver.ValidateRaidedLSSDinVG(volumeGroupName)
+		if err != nil {
+			return fmt.Errorf("validateRaidedLSSDinVG error: %v", err)
+		}
+	} else {
+		err := driver.CreateVg(volumeGroupName, raidedLocalSsdPath)
+		if err != nil {
+			return err
+		}
 	}
 
 	klog.V(2).Infof("Datacache enabled for node %s", nodeName)

go.mod  (+1 −1)
@@ -58,7 +58,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/emicklei/go-restful v2.9.5+incompatible // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/fsnotify/fsnotify v1.5.4 // indirect
+	github.com/fsnotify/fsnotify v1.8.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/jsonpointer v0.20.0 // indirect

go.sum  (+2)
@@ -1032,6 +1032,8 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4
 github.com/fsnotify/fsnotify v1.5.1/go.mod h1:T3375wBYaZdLLcVNkcVbzGHY7f1l/uK5T5Ai1i3InKU=
 github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
 github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
+github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
+github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/fsouza/fake-gcs-server v0.0.0-20180612165233-e85be23bdaa8/go.mod h1:1/HufuJ+eaDf4KTnYdS6HJMGvMRU8d4cYTuu/1QaBbI=
 github.com/fsouza/fake-gcs-server v1.19.4/go.mod h1:I0/88nHCASqJJ5M7zVF0zKODkYTcuXFW5J5yajsNJnE=
 github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=

pkg/gce-pd-csi-driver/cache.go  (+75 −12)
@@ -7,6 +7,7 @@ import (
 	"strings"
 
 	csi "github.com/container-storage-interface/spec/lib/go/csi"
+	fsnotify "github.com/fsnotify/fsnotify"
 
 	"k8s.io/klog/v2"
 
@@ -25,7 +26,7 @@ var raidedLocalSsdPath = raidedLssdPrefix + raidedLocalSsdName
 
 func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId string) (string, error) {
 	volumeId := req.GetVolumeId()
-	volumeGroupName := getVolumeGroupName(nodeId)
+	volumeGroupName := GetVolumeGroupName(nodeId)
 	mainDevicePath := "/dev/" + volumeGroupName + "/" + getLvName(mainLvSuffix, volumeId)
 	mainLvName := getLvName(mainLvSuffix, volumeId)
 	klog.V(2).Infof("Volume group available on node %v ", volumeGroupName)
@@ -37,12 +38,12 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	infoString := strings.TrimSpace(string(info))
 	raidedLocalSsdPath = raidedLssdPrefix + infoString
 
-	vgExists := checkVgExists(volumeGroupName)
+	vgExists := CheckVgExists(volumeGroupName)
 	if vgExists {
 		// Clean up Volume Group before adding the PD
-		reduceVolumeGroup(volumeGroupName, true)
+		ReduceVolumeGroup(volumeGroupName, true)
 	} else {
-		err := createVg(volumeGroupName, devicePath, raidedLocalSsdPath)
+		err := CreateVg(volumeGroupName, raidedLocalSsdPath)
 		if err != nil {
 			return mainDevicePath, err
 		}
@@ -75,7 +76,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 			klog.Errorf("Errored while deactivating VG %v: err: %v: %s", vgNameForPv, err, info)
 		}
 		// CLean up volume group to remove any dangling PV refrences
-		reduceVolumeGroup(vgNameForPv, false)
+		ReduceVolumeGroup(vgNameForPv, false)
 		_, isCached := isCachingSetup(mainLvName)
 		// We will continue to uncache even if it errors to check caching as it is not a terminal issue.
 		if isCached {
@@ -91,7 +92,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 				return "", fmt.Errorf("errored while uncaching main LV. %v: %s", err, info)
 			}
 			// CLean up volume group to remove any dangling PV refrences
-			reduceVolumeGroup(vgNameForPv, false)
+			ReduceVolumeGroup(vgNameForPv, false)
 		}
 		info, err = common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgmerge", []string{volumeGroupName, vgNameForPv}...)
 		if err != nil {
@@ -189,7 +190,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	return mainDevicePath, nil
 }
 
-func checkVgExists(volumeGroupName string) bool {
+func CheckVgExists(volumeGroupName string) bool {
 	args := []string{}
 	info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgscan", args...)
 	if err != nil {
@@ -202,8 +203,8 @@ func checkVgExists(volumeGroupName string) bool {
 
 func cleanupCache(volumeId string, nodeId string) error {
 
-	volumeGroupName := getVolumeGroupName(nodeId)
-	if !checkVgExists(volumeGroupName) {
+	volumeGroupName := GetVolumeGroupName(nodeId)
+	if !CheckVgExists(volumeGroupName) {
 		// If volume group doesn't exist then there's nothing to uncache
 		return nil
 	}
@@ -228,7 +229,7 @@ func cleanupCache(volumeId string, nodeId string) error {
 	return nil
 }
 
-func getVolumeGroupName(nodePath string) string {
+func GetVolumeGroupName(nodePath string) string {
 	nodeSlice := strings.Split(nodePath, "/")
 	nodeId := nodeSlice[len(nodeSlice)-1]
 	nodeHash := common.ShortString(nodeId)
@@ -241,7 +242,7 @@ func getLvName(suffix string, volumeId string) string {
 	return fmt.Sprintf("%s-%s", suffix, pvcName)
 }
 
-func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string) error {
+func CreateVg(volumeGroupName string, raidedLocalSsds string) error {
 	args := []string{
 		"--zero",
 		"y",
@@ -263,7 +264,7 @@ func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string)
 	return nil
 }
 
-func reduceVolumeGroup(volumeGroupName string, force bool) {
+func ReduceVolumeGroup(volumeGroupName string, force bool) {
 	args := []string{
 		"--removemissing",
 		volumeGroupName,
@@ -366,3 +367,65 @@ func isCachingSetup(mainLvName string) (error, bool) {
 	}
 	return nil, false
 }
+
+func StartWatcher(nodeName string) {
+	dirToWatch := "/dev/"
+	watcher, err := fsnotify.NewWatcher()
+	if err != nil {
+		klog.V(2).ErrorS(err, "errored while creating watcher")
+	}
+	defer watcher.Close()
+
+	// out of the box fsnotify can watch a single file, or a single directory
+	if err := watcher.Add(dirToWatch); err != nil {
+		klog.V(2).ErrorS(err, "errored while adding watcher directory")
+	}
+	errorCh := make(chan error, 1)
+	// Handle the error received from the watcher goroutine
+	go watchDiskDetaches(watcher, nodeName, errorCh)
+
+	select {
+	case err := <-errorCh:
+		klog.Errorf("watcher encountered an error: %v", err)
+	}
+}
+
+func watchDiskDetaches(watcher *fsnotify.Watcher, nodeName string, errorCh chan error) error {
+	for {
+		select {
+		// watch for errors
+		case err := <-watcher.Errors:
+			errorCh <- fmt.Errorf("disk update event errored: %v", err)
+		// watch for events
+		case event := <-watcher.Events:
+			// In case of an event i.e. creation or deletion of any new PV, we update the VG metadata.
+			// This might include some non-LVM changes, no harm in updating metadata multiple times.
+			ReduceVolumeGroup(GetVolumeGroupName(nodeName), true)
+			klog.V(2).Infof("disk attach/detach event %#v\n", event)
+		}
+	}
+}
+
+func ValidateRaidedLSSDinVG(vgName string) error {
+	args := []string{
+		"--noheadings",
+		"-o",
+		"pv_name",
+		"--select",
+		"vg_name=" + vgName,
+	}
+	info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "pvs", args...)
+	if err != nil {
+		return fmt.Errorf("errored while checking physical volume details %v: %s", err, info)
+		// On error info contains the error message which we cannot use for further steps
+	}
+
+	klog.V(2).Infof("Got PVs %v in VG %v", strings.TrimSpace(string(info)), vgName)
+	if !strings.Contains(string(info), "/dev/md127") {
+		info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgextend", []string{vgName, "/dev/md127"}...)
+		if err != nil {
+			klog.Errorf("errored while extending VGs %v: %s", err, info)
+		}
+	}
+	return nil
+}
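
Note: the StartWatcher/watchDiskDetaches pair added above follows the standard fsnotify pattern: create a watcher, add a single directory (fsnotify does not watch recursively), then drain the Events and Errors channels in a loop. Below is a minimal self-contained sketch of that pattern for reference; it uses the canonical github.com/fsnotify/fsnotify import path (this repo vendors the module under a mirrored path) and plain logging in place of the driver's LVM metadata refresh.

package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func main() {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatalf("creating watcher: %v", err)
	}
	defer watcher.Close()

	// Watch the device directory; a disk attach/detach shows up as a
	// create/remove event on an entry under /dev/.
	if err := watcher.Add("/dev/"); err != nil {
		log.Fatalf("watching /dev/: %v", err)
	}

	for {
		select {
		case event, ok := <-watcher.Events:
			if !ok {
				return // watcher was closed
			}
			// The driver reacts here by calling ReduceVolumeGroup to drop
			// physical volumes that no longer exist; this sketch just logs.
			log.Printf("device event: %s", event)
		case err, ok := <-watcher.Errors:
			if !ok {
				return
			}
			log.Printf("watch error: %v", err)
		}
	}
}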

vendor/github.com/fsnotify/fsnotify/.cirrus.yml  (+14; generated vendor file, diff not rendered)

vendor/github.com/fsnotify/fsnotify/.gitignore  (+10; generated vendor file, diff not rendered)

vendor/github.com/fsnotify/fsnotify/.mailmap  (+2; generated vendor file, diff not rendered)
